mirror of
https://github.com/amd/blis.git
synced 2026-04-19 23:28:52 +00:00
Load balance thread ranges for arbitrary diagonals.
Details:
- Expanded/updated interface for bli_get_range_weighted() and
bli_get_range() so that the direction of movement is specified in the
function name (e.g. bli_get_range_l2r(), bli_get_range_weighted_t2b())
and also so that the object being partitioned is passed instead of an
uplo parameter. Updated invocations in level-3 blocked variants, as
appropriate.
- (Re)implemented bli_get_range_*() and bli_get_range_weighted_*() to
carefully take into account the location of the diagonal when computing
ranges so that the area of each subpartition (which, in all present
level-3 operations, is proportional to the amount of computation
engendered) is as equal as possible.
- Added calls to a new class of routines to all non-gemm level-3 blocked
variants:
bli_<oper>_prune_unref_mparts_[mnk]()
where <oper> is herk, trmm, or trsm and [mnk] is chosen based on which
dimension is being partitioned. These routines call a more basic
routine, bli_prune_unref_mparts(), to prune unreferenced/unstored
regions from matrices and simultaneously adjust other matrices which
share the same dimension accordingly.
- Simplified herk_blk_var2f, trmm_blk_var1f/b as a result of the new
pruning routines.
- Fixed incorrect blocking factors passed into bli_get_range_*() in
bli_trsm_blk_var[12][fb].c
- Added a new test driver in test/thread_ranges that can exercise the new
bli_get_range_*() and bli_get_range_weighted_*() under a range of
conditions.
- Reimplemented m and n fields of obj_t as elements in a "dim"
array field so that dimensions could be queried via index constant
(e.g. BLIS_M, BLIS_N). Adjusted/added query and modification
macros accordingly.
- Defined mdim_t type to enumerate BLIS_M and BLIS_N indexing values.
- Added bli_round() macro, which calls C math library function round(),
and bli_round_to_mult(), which rounds a value to the nearest multiple
of some other value.
- Added miscellaneous pruning- and mdim_t-related macros.
- Renamed bli_obj_row_offset(), bli_obj_col_offset() macros to
bli_obj_row_off(), bli_obj_col_off().
This commit is contained in:
@@ -51,7 +51,6 @@ void bli_gemm_blk_var1f( obj_t* a,
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing B.
|
||||
@@ -80,21 +79,19 @@ void bli_gemm_blk_var1f( obj_t* a,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
gemm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
m_trans = bli_obj_length_after_trans( *a );
|
||||
dim_t start, end;
|
||||
bli_get_range_t2b( thread, 0, m_trans,
|
||||
dim_t my_start, my_end;
|
||||
bli_get_range_t2b( thread, a,
|
||||
bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ),
|
||||
&start, &end );
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
for ( i = my_start; i < my_end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
// NOTE: Use of a (for execution datatype) is intentional!
|
||||
// This causes the right blocksize to be used if c and a are
|
||||
// complex and b is real.
|
||||
b_alg = bli_determine_blocksize_f( i, end, a,
|
||||
b_alg = bli_determine_blocksize_f( i, my_end, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
|
||||
@@ -50,7 +50,6 @@ void bli_gemm_blk_var2f( obj_t* a,
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t n_trans;
|
||||
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
@@ -79,21 +78,19 @@ void bli_gemm_blk_var2f( obj_t* a,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
gemm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *b );
|
||||
dim_t start, end;
|
||||
bli_get_range_l2r( thread, 0, n_trans,
|
||||
dim_t my_start, my_end;
|
||||
bli_get_range_l2r( thread, b,
|
||||
bli_blksz_get_mult_for_obj( b, cntl_blocksize( cntl ) ),
|
||||
&start, &end );
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
for ( i = my_start; i < my_end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
// NOTE: Use of b (for execution datatype) is intentional!
|
||||
// This causes the right blocksize to be used if c and a are
|
||||
// complex and b is real.
|
||||
b_alg = bli_determine_blocksize_f( i, end, b,
|
||||
b_alg = bli_determine_blocksize_f( i, my_end, b,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for B1 and C1.
|
||||
|
||||
@@ -59,7 +59,6 @@ void bli_gemm_blk_var4f( obj_t* a,
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing B.
|
||||
@@ -88,21 +87,19 @@ void bli_gemm_blk_var4f( obj_t* a,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
gemm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
m_trans = bli_obj_length_after_trans( *a );
|
||||
dim_t start, end;
|
||||
bli_get_range_t2b( thread, 0, m_trans,
|
||||
dim_t my_start, my_end;
|
||||
bli_get_range_t2b( thread, a,
|
||||
bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ),
|
||||
&start, &end );
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
for ( i = my_start; i < my_end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
// NOTE: Use of a (for execution datatype) is intentional!
|
||||
// This causes the right blocksize to be used if c and a are
|
||||
// complex and b is real.
|
||||
b_alg = bli_determine_blocksize_f( i, end, a,
|
||||
b_alg = bli_determine_blocksize_f( i, my_end, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
|
||||
@@ -35,6 +35,7 @@
|
||||
#include "bli_herk_check.h"
|
||||
#include "bli_herk_front.h"
|
||||
#include "bli_herk_int.h"
|
||||
#include "bli_herk_prune.h"
|
||||
|
||||
#include "bli_herk_blk_var1f.h"
|
||||
|
||||
|
||||
@@ -50,7 +50,9 @@ void bli_herk_blk_var1f( obj_t* a,
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_herk_prune_unref_mparts_m( a, ah, c );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing A'.
|
||||
@@ -79,18 +81,16 @@ void bli_herk_blk_var1f( obj_t* a,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
herk_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
m_trans = bli_obj_length_after_trans( *c );
|
||||
dim_t start, end;
|
||||
bli_get_range_weighted_t2b( thread, 0, m_trans,
|
||||
dim_t my_start, my_end;
|
||||
bli_get_range_weighted_t2b( thread, c,
|
||||
bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ),
|
||||
bli_obj_root_uplo( *c ), &start, &end );
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
for ( i = my_start; i < my_end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, end, a,
|
||||
b_alg = bli_determine_blocksize_f( i, my_end, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
|
||||
@@ -41,23 +41,18 @@ void bli_herk_blk_var2f( obj_t* a,
|
||||
herk_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_pack_s;
|
||||
obj_t ah1_pack_s, c1S_pack_s;
|
||||
obj_t ah1_pack_s, c1_pack_s;
|
||||
|
||||
obj_t ah1, c1, c1S;
|
||||
obj_t aS_pack;
|
||||
obj_t ah1, c1;
|
||||
obj_t* a_pack;
|
||||
obj_t* ah1_pack;
|
||||
obj_t* c1S_pack;
|
||||
obj_t* c1_pack;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t n_trans;
|
||||
subpart_t stored_part;
|
||||
|
||||
// The upper and lower variants are identical, except for which
|
||||
// merged subpartition is acquired in the loop body.
|
||||
if ( bli_obj_is_lower( *c ) ) stored_part = BLIS_SUBPART1B;
|
||||
else stored_part = BLIS_SUBPART1T;
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_herk_prune_unref_mparts_n( a, ah, c );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing A
|
||||
@@ -75,30 +70,26 @@ void bli_herk_blk_var2f( obj_t* a,
|
||||
// Initialize pack objects for C and A' that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &ah1_pack_s );
|
||||
bli_obj_init_pack( &c1S_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
ah1_pack = thread_ibroadcast( thread, &ah1_pack_s );
|
||||
c1S_pack = thread_ibroadcast( thread, &c1S_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, a_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
herk_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *c );
|
||||
dim_t start, end;
|
||||
|
||||
// Needs to be replaced with a weighted range because triangle
|
||||
bli_get_range_weighted_l2r( thread, 0, n_trans,
|
||||
dim_t my_start, my_end;
|
||||
bli_get_range_weighted_l2r( thread, c,
|
||||
bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ),
|
||||
bli_obj_root_uplo( *c ), &start, &end );
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
for ( i = my_start; i < my_end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, end, a,
|
||||
b_alg = bli_determine_blocksize_f( i, my_end, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1' and C1.
|
||||
@@ -107,18 +98,11 @@ void bli_herk_blk_var2f( obj_t* a,
|
||||
bli_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Partition off the stored region of C1 and the corresponding region
|
||||
// of A_pack.
|
||||
bli_acquire_mpart_t2b( stored_part,
|
||||
i, b_alg, &c1, &c1S );
|
||||
bli_acquire_mpart_t2b( stored_part,
|
||||
i, b_alg, a_pack, &aS_pack );
|
||||
|
||||
// Initialize objects for packing A1' and C1.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &ah1, ah1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1S, c1S_pack,
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread ) ;
|
||||
@@ -129,23 +113,23 @@ void bli_herk_blk_var2f( obj_t* a,
|
||||
herk_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1S, c1S_pack,
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) ) ;
|
||||
|
||||
// Perform herk subproblem.
|
||||
bli_herk_int( &BLIS_ONE,
|
||||
&aS_pack,
|
||||
a_pack,
|
||||
ah1_pack,
|
||||
&BLIS_ONE,
|
||||
c1S_pack,
|
||||
c1_pack,
|
||||
cntl_sub_gemm( cntl ),
|
||||
herk_thread_sub_herk( thread ) );
|
||||
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( c1S_pack, &c1S,
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) );
|
||||
}
|
||||
@@ -157,7 +141,7 @@ void bli_herk_blk_var2f( obj_t* a,
|
||||
bli_packm_release( a_pack, cntl_sub_packm_a( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_release( ah1_pack, cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_release( c1S_pack, cntl_sub_packm_c( cntl ) );
|
||||
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -52,6 +52,9 @@ void bli_herk_blk_var3f( obj_t* a,
|
||||
dim_t b_alg;
|
||||
dim_t k_trans;
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_herk_prune_unref_mparts_k( a, ah, c );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing C.
|
||||
bli_obj_init_pack( &c_pack_s );
|
||||
|
||||
64
frame/3/herk/bli_herk_prune.c
Normal file
64
frame/3/herk/bli_herk_prune.c
Normal file
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// herk pruning for the m dimension. Forwards to bli_prune_unref_mparts()
// with c as the first object and a as the second, both indexed by BLIS_M.
// The ah argument is not referenced by this routine.
void bli_herk_prune_unref_mparts_m( obj_t* a,
|
||||
obj_t* ah,
|
||||
obj_t* c )
|
||||
{
|
||||
// Prune any unreferenced part from the subpartition of C (that would
|
||||
// be encountered from partitioning in the m dimension) and adjust the
|
||||
// subpartition of A accordingly.
|
||||
bli_prune_unref_mparts( c, BLIS_M, a, BLIS_M );
|
||||
}
|
||||
|
||||
// herk pruning for the n dimension. Forwards to bli_prune_unref_mparts()
// with c as the first object and ah as the second, both indexed by BLIS_N.
// The a argument is not referenced by this routine.
void bli_herk_prune_unref_mparts_n( obj_t* a,
|
||||
obj_t* ah,
|
||||
obj_t* c )
|
||||
{
|
||||
// Prune any unreferenced part from the subpartition of C (that would
|
||||
// be encountered from partitioning in the n dimension) and adjust the
|
||||
// subpartition of Ah accordingly.
|
||||
bli_prune_unref_mparts( c, BLIS_N, ah, BLIS_N );
|
||||
}
|
||||
|
||||
// herk pruning for the k dimension. This routine is intentionally a no-op:
// none of its arguments are read or modified.
void bli_herk_prune_unref_mparts_k( obj_t* a,
|
||||
obj_t* ah,
|
||||
obj_t* c )
|
||||
{
|
||||
// As long as A and Ah are general in structure, no pruning should be
|
||||
// needed for the k dimension.
|
||||
}
|
||||
|
||||
38
frame/3/herk/bli_herk_prune.h
Normal file
38
frame/3/herk/bli_herk_prune.h
Normal file
@@ -0,0 +1,38 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Prototypes for the herk pruning routines, one per partitioned dimension
// (m, n, k). Each takes the same three operand objects: a, ah, and c.
void bli_herk_prune_unref_mparts_m( obj_t* a, obj_t* ah, obj_t* c );
|
||||
void bli_herk_prune_unref_mparts_n( obj_t* a, obj_t* ah, obj_t* c );
|
||||
void bli_herk_prune_unref_mparts_k( obj_t* a, obj_t* ah, obj_t* c );
|
||||
|
||||
@@ -36,6 +36,7 @@
|
||||
#include "bli_trmm_check.h"
|
||||
#include "bli_trmm_front.h"
|
||||
#include "bli_trmm_int.h"
|
||||
#include "bli_trmm_prune.h"
|
||||
|
||||
#include "bli_trmm_blk_var1f.h"
|
||||
|
||||
|
||||
@@ -50,8 +50,9 @@ void bli_trmm_blk_var1f( obj_t* a,
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
dim_t offA;
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_trmm_prune_unref_mparts_m( a, b, c );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing B.
|
||||
@@ -81,28 +82,28 @@ void bli_trmm_blk_var1f( obj_t* a,
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Set the default length of and offset to the non-zero part of A.
|
||||
m_trans = bli_obj_length_after_trans( *a );
|
||||
offA = 0;
|
||||
//m_trans = bli_obj_length_after_trans( *a );
|
||||
//offA = 0;
|
||||
|
||||
// If A is lower triangular, we have to adjust where the non-zero part of
|
||||
// A begins. If A is upper triangular, we have to adjust the length of
|
||||
// the non-zero part. If A is general/dense, then we keep the defaults.
|
||||
if ( bli_obj_is_lower( *a ) )
|
||||
offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) );
|
||||
else if ( bli_obj_is_upper( *a ) )
|
||||
m_trans = bli_abs( bli_obj_diag_offset_after_trans( *a ) ) +
|
||||
bli_obj_width_after_trans( *a );
|
||||
//if ( bli_obj_is_lower( *a ) )
|
||||
// offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) );
|
||||
//else if ( bli_obj_is_upper( *a ) )
|
||||
// m_trans = bli_abs( bli_obj_diag_offset_after_trans( *a ) ) +
|
||||
// bli_obj_width_after_trans( *a );
|
||||
|
||||
dim_t start, end;
|
||||
bli_get_range_weighted_t2b( thread, offA, m_trans,
|
||||
dim_t my_start, my_end;
|
||||
bli_get_range_weighted_t2b( thread, a,
|
||||
bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ),
|
||||
bli_obj_root_uplo( *a ), &start, &end );
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
for ( i = my_start; i < my_end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, end, a,
|
||||
b_alg = bli_determine_blocksize_f( i, my_end, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
|
||||
@@ -50,8 +50,9 @@ void bli_trmm_blk_var2b( obj_t* a,
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t n_trans;
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_trmm_prune_unref_mparts_n( a, b, c );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing A
|
||||
@@ -79,18 +80,16 @@ void bli_trmm_blk_var2b( obj_t* a,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *b );
|
||||
dim_t start, end;
|
||||
bli_get_range_weighted_r2l( thread, 0, n_trans,
|
||||
dim_t my_start, my_end;
|
||||
bli_get_range_weighted_r2l( thread, b,
|
||||
bli_blksz_get_mult_for_obj( b, cntl_blocksize( cntl ) ),
|
||||
bli_obj_root_uplo( *b ), &start, &end );
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
for ( i = my_start; i < my_end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_b( i, end, b,
|
||||
b_alg = bli_determine_blocksize_b( i, my_end, b,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for B1 and C1.
|
||||
|
||||
@@ -50,8 +50,9 @@ void bli_trmm_blk_var2f( obj_t* a,
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t n_trans;
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_trmm_prune_unref_mparts_n( a, b, c );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing A
|
||||
@@ -79,18 +80,16 @@ void bli_trmm_blk_var2f( obj_t* a,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *b );
|
||||
dim_t start, end;
|
||||
bli_get_range_weighted_l2r( thread, 0, n_trans,
|
||||
dim_t my_start, my_end;
|
||||
bli_get_range_weighted_l2r( thread, b,
|
||||
bli_blksz_get_mult_for_obj( b, cntl_blocksize( cntl ) ),
|
||||
bli_obj_root_uplo( *b ), &start, &end );
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
for ( i = my_start; i < my_end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, end, b,
|
||||
b_alg = bli_determine_blocksize_f( i, my_end, b,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for B1 and C1.
|
||||
|
||||
@@ -52,6 +52,9 @@ void bli_trmm_blk_var3b( obj_t* a,
|
||||
dim_t b_alg;
|
||||
dim_t k_trans;
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_trmm_prune_unref_mparts_k( a, b, c );
|
||||
|
||||
if( thread_am_ochief( thread ) ){
|
||||
// Initialize object for packing C
|
||||
bli_obj_init_pack( &c_pack_s );
|
||||
|
||||
@@ -52,6 +52,9 @@ void bli_trmm_blk_var3f( obj_t* a,
|
||||
dim_t b_alg;
|
||||
dim_t k_trans;
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_trmm_prune_unref_mparts_k( a, b, c );
|
||||
|
||||
if( thread_am_ochief( thread ) ){
|
||||
// Initialize object for packing C
|
||||
bli_obj_init_pack( &c_pack_s );
|
||||
|
||||
71
frame/3/trmm/bli_trmm_prune.c
Normal file
71
frame/3/trmm/bli_trmm_prune.c
Normal file
@@ -0,0 +1,71 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// trmm pruning for the m dimension. Forwards to bli_prune_unref_mparts()
// with a as the first object and c as the second, both indexed by BLIS_M.
// The b argument is not referenced by this routine.
void bli_trmm_prune_unref_mparts_m( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c )
|
||||
{
|
||||
// Prune any unreferenced part from the subpartition of A (that would
|
||||
// be encountered from partitioning in the m dimension) and adjust the
|
||||
// subpartition of C accordingly.
|
||||
bli_prune_unref_mparts( a, BLIS_M, c, BLIS_M );
|
||||
}
|
||||
|
||||
// trmm pruning for the n dimension. Forwards to bli_prune_unref_mparts()
// with b as the first object and c as the second, both indexed by BLIS_N.
// The a argument is not referenced by this routine.
void bli_trmm_prune_unref_mparts_n( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c )
|
||||
{
|
||||
// Prune any unreferenced part from the subpartition of B (that would
|
||||
// be encountered from partitioning in the n dimension) and adjust the
|
||||
// subpartition of C accordingly.
|
||||
bli_prune_unref_mparts( b, BLIS_N, c, BLIS_N );
|
||||
}
|
||||
|
||||
// trmm pruning for the k dimension. The k dimension is indexed as BLIS_N
// of a and BLIS_M of b. bli_prune_unref_mparts() is called twice, first
// with a as the leading object and then with b as the leading object.
// The c argument is not referenced by this routine.
void bli_trmm_prune_unref_mparts_k( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c )
|
||||
{
|
||||
// Prune any unreferenced part from the subpartition of A (that would
|
||||
// be encountered from partitioning in the k dimension) and adjust the
|
||||
// subpartition of B accordingly.
|
||||
bli_prune_unref_mparts( a, BLIS_N, b, BLIS_M );
|
||||
|
||||
// Prune any unreferenced part from the subpartition of B (that would
|
||||
// be encountered from partitioning in the k dimension) and adjust the
|
||||
// subpartition of A accordingly.
|
||||
bli_prune_unref_mparts( b, BLIS_M, a, BLIS_N );
|
||||
}
|
||||
|
||||
38
frame/3/trmm/bli_trmm_prune.h
Normal file
38
frame/3/trmm/bli_trmm_prune.h
Normal file
@@ -0,0 +1,38 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Prototypes for the trmm pruning routines, one per partitioned dimension
// (m, n, k). Each takes the same three operand objects: a, b, and c.
void bli_trmm_prune_unref_mparts_m( obj_t* a, obj_t* b, obj_t* c );
|
||||
void bli_trmm_prune_unref_mparts_n( obj_t* a, obj_t* b, obj_t* c );
|
||||
void bli_trmm_prune_unref_mparts_k( obj_t* a, obj_t* b, obj_t* c );
|
||||
|
||||
@@ -37,6 +37,7 @@
|
||||
#include "bli_trsm_check.h"
|
||||
#include "bli_trsm_front.h"
|
||||
#include "bli_trsm_int.h"
|
||||
#include "bli_trsm_prune.h"
|
||||
|
||||
#include "bli_gemmtrsm_ukernel.h"
|
||||
#include "bli_trsm_ukernel.h"
|
||||
|
||||
@@ -49,8 +49,9 @@ void bli_trsm_blk_var1b( obj_t* a,
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
dim_t offA;
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_trsm_prune_unref_mparts_m( a, b, c );
|
||||
|
||||
// Initialize object for packing B.
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
@@ -71,28 +72,19 @@ void bli_trsm_blk_var1b( obj_t* a,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
trsm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Set the default length of and offset to the non-zero part of A.
|
||||
m_trans = bli_obj_length_after_trans( *a );
|
||||
offA = 0;
|
||||
|
||||
// If A is upper triangular, we have to adjust where the non-zero part of
|
||||
// A begins.
|
||||
if ( bli_obj_is_upper( *a ) )
|
||||
offA = m_trans - bli_abs( bli_obj_diag_offset_after_trans( *a ) ) -
|
||||
bli_obj_width_after_trans( *a );
|
||||
|
||||
dim_t start, end;
|
||||
dim_t my_start, my_end;
|
||||
num_t dt = bli_obj_execution_datatype( *a );
|
||||
bli_get_range_b2t( thread, offA, m_trans,
|
||||
//bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ),
|
||||
bli_info_get_default_mc( BLIS_TRSM, dt ),
|
||||
&start, &end );
|
||||
dim_t bf = ( bli_obj_root_is_triangular( *a ) ?
|
||||
bli_info_get_default_mr( BLIS_TRSM, dt ) :
|
||||
bli_info_get_default_nr( BLIS_TRSM, dt ) );
|
||||
bli_get_range_b2t( thread, a, bf,
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the remaining portion of the m dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
for ( i = my_start; i < my_end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_b( i, end, a,
|
||||
b_alg = bli_determine_blocksize_b( i, my_end, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
|
||||
@@ -49,8 +49,9 @@ void bli_trsm_blk_var1f( obj_t* a,
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
dim_t offA;
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_trsm_prune_unref_mparts_m( a, b, c );
|
||||
|
||||
// Initialize object for packing B.
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
@@ -71,27 +72,19 @@ void bli_trsm_blk_var1f( obj_t* a,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
trsm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Set the default length of and offset to the non-zero part of A.
|
||||
m_trans = bli_obj_length_after_trans( *a );
|
||||
offA = 0;
|
||||
|
||||
// If A is lower triangular, we have to adjust where the non-zero part of
|
||||
// A begins.
|
||||
if ( bli_obj_is_lower( *a ) )
|
||||
offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) );
|
||||
|
||||
dim_t start, end;
|
||||
dim_t my_start, my_end;
|
||||
num_t dt = bli_obj_execution_datatype( *a );
|
||||
bli_get_range_t2b( thread, offA, m_trans,
|
||||
//bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ),
|
||||
bli_info_get_default_mc( BLIS_TRSM, dt ),
|
||||
&start, &end );
|
||||
dim_t bf = ( bli_obj_root_is_triangular( *a ) ?
|
||||
bli_info_get_default_mr( BLIS_TRSM, dt ) :
|
||||
bli_info_get_default_nr( BLIS_TRSM, dt ) );
|
||||
bli_get_range_t2b( thread, a, bf,
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the remaining portion of the m dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
for ( i = my_start; i < my_end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, end, a,
|
||||
b_alg = bli_determine_blocksize_f( i, my_end, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
|
||||
@@ -50,7 +50,9 @@ void bli_trsm_blk_var2b( obj_t* a,
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t n_trans;
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_trsm_prune_unref_mparts_n( a, b, c );
|
||||
|
||||
// Initialize pack objects for A that are passed into packm_init().
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
@@ -78,24 +80,21 @@ void bli_trsm_blk_var2b( obj_t* a,
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, a_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
trsm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *b );
|
||||
dim_t start, end;
|
||||
dim_t my_start, my_end;
|
||||
num_t dt = bli_obj_execution_datatype( *a );
|
||||
bli_get_range_r2l( thread, 0, n_trans,
|
||||
//bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ),
|
||||
// bli_info_get_default_mr( BLIS_TRSM, dt ) ),
|
||||
bli_lcm( bli_blksz_get_nr( dt, cntl_blocksize( cntl ) ),
|
||||
bli_blksz_get_mr( dt, cntl_blocksize( cntl ) ) ),
|
||||
&start, &end );
|
||||
dim_t bf = ( bli_obj_root_is_triangular( *b ) ?
|
||||
bli_info_get_default_mr( BLIS_TRSM, dt ) :
|
||||
bli_info_get_default_nr( BLIS_TRSM, dt ) );
|
||||
bli_get_range_r2l( thread, b, bf,
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
for ( i = my_start; i < my_end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_b( i, end, b,
|
||||
b_alg = bli_determine_blocksize_b( i, my_end, b,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for B1 and C1.
|
||||
|
||||
@@ -50,7 +50,9 @@ void bli_trsm_blk_var2f( obj_t* a,
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t n_trans;
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_trsm_prune_unref_mparts_n( a, b, c );
|
||||
|
||||
// Initialize pack objects for A that are passed into packm_init().
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
@@ -78,24 +80,21 @@ void bli_trsm_blk_var2f( obj_t* a,
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, a_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
trsm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *b );
|
||||
dim_t start, end;
|
||||
dim_t my_start, my_end;
|
||||
num_t dt = bli_obj_execution_datatype( *a );
|
||||
bli_get_range_l2r( thread, 0, n_trans,
|
||||
//bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ),
|
||||
// bli_info_get_default_mr( BLIS_TRSM, dt ) ),
|
||||
bli_lcm( bli_blksz_get_nr( dt, cntl_blocksize( cntl ) ),
|
||||
bli_blksz_get_mr( dt, cntl_blocksize( cntl ) ) ),
|
||||
&start, &end );
|
||||
dim_t bf = ( bli_obj_root_is_triangular( *b ) ?
|
||||
bli_info_get_default_mr( BLIS_TRSM, dt ) :
|
||||
bli_info_get_default_nr( BLIS_TRSM, dt ) );
|
||||
bli_get_range_l2r( thread, b, bf,
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
for ( i = my_start; i < my_end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, end, b,
|
||||
b_alg = bli_determine_blocksize_f( i, my_end, b,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for B1 and C1.
|
||||
|
||||
@@ -52,6 +52,9 @@ void bli_trsm_blk_var3b( obj_t* a,
|
||||
dim_t b_alg;
|
||||
dim_t k_trans;
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_trsm_prune_unref_mparts_k( a, b, c );
|
||||
|
||||
// Initialize pack objects for C that are passed into packm_init().
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
bli_obj_init_pack( &c_pack_s );
|
||||
|
||||
@@ -52,6 +52,9 @@ void bli_trsm_blk_var3f( obj_t* a,
|
||||
dim_t b_alg;
|
||||
dim_t k_trans;
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_trsm_prune_unref_mparts_k( a, b, c );
|
||||
|
||||
// Initialize pack objects for C that are passed into packm_init().
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
bli_obj_init_pack( &c_pack_s );
|
||||
|
||||
71
frame/3/trsm/bli_trsm_prune.c
Normal file
71
frame/3/trsm/bli_trsm_prune.c
Normal file
@@ -0,0 +1,71 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// trsm pruning hook for variants that partition along the m dimension.
//
// a is forwarded to bli_prune_unref_mparts() as the primary object and c as
// the secondary object, each indexed by BLIS_M, so that any unreferenced part
// of A is pruned and C is adjusted to match. b is not referenced here.
void bli_trsm_prune_unref_mparts_m( obj_t* a, obj_t* b, obj_t* c )
{
	const mdim_t part_dim = BLIS_M;

	bli_prune_unref_mparts( a, part_dim, c, part_dim );
}
|
||||
|
||||
// trsm pruning hook for variants that partition along the n dimension.
//
// b is forwarded to bli_prune_unref_mparts() as the primary object and c as
// the secondary object, each indexed by BLIS_N, so that any unreferenced part
// of B is pruned and C is adjusted to match. a is not referenced here.
void bli_trsm_prune_unref_mparts_n( obj_t* a, obj_t* b, obj_t* c )
{
	const mdim_t part_dim = BLIS_N;

	bli_prune_unref_mparts( b, part_dim, c, part_dim );
}
|
||||
|
||||
// trsm pruning hook for variants that partition along the k dimension.
//
// The k dimension is indexed as BLIS_N on a and as BLIS_M on b. Two passes
// are made through bli_prune_unref_mparts(), in this order: first with a as
// the primary object (b adjusted to match), then with b as the primary
// object (a adjusted to match). c is not referenced here.
void bli_trsm_prune_unref_mparts_k( obj_t* a, obj_t* b, obj_t* c )
{
	const mdim_t k_dim_of_a = BLIS_N;
	const mdim_t k_dim_of_b = BLIS_M;

	// Pass 1: prune according to the structure of A; B follows.
	bli_prune_unref_mparts( a, k_dim_of_a, b, k_dim_of_b );

	// Pass 2: prune according to the structure of B; A follows.
	bli_prune_unref_mparts( b, k_dim_of_b, a, k_dim_of_a );
}
|
||||
|
||||
38
frame/3/trsm/bli_trsm_prune.h
Normal file
38
frame/3/trsm/bli_trsm_prune.h
Normal file
@@ -0,0 +1,38 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Pruning hooks for trsm. The suffix (_m, _n, _k) names the dimension along
// which the caller is about to partition; all three take the A, B and C
// operands in that order.
void bli_trsm_prune_unref_mparts_m
     (
       obj_t* a,
       obj_t* b,
       obj_t* c
     );

void bli_trsm_prune_unref_mparts_n
     (
       obj_t* a,
       obj_t* b,
       obj_t* c
     );

void bli_trsm_prune_unref_mparts_k
     (
       obj_t* a,
       obj_t* b,
       obj_t* c
     );
|
||||
|
||||
@@ -509,8 +509,8 @@ void bli_obj_print( char* label, obj_t* obj )
|
||||
( unsigned long int )bli_obj_width( *obj ) );
|
||||
fprintf( file, "\n" );
|
||||
|
||||
fprintf( file, " offm, offn %lu, %lu\n", ( unsigned long int )bli_obj_row_offset( *obj ),
|
||||
( unsigned long int )bli_obj_col_offset( *obj ) );
|
||||
fprintf( file, " offm, offn %lu, %lu\n", ( unsigned long int )bli_obj_row_off( *obj ),
|
||||
( unsigned long int )bli_obj_col_off( *obj ) );
|
||||
fprintf( file, " diagoff %ld\n", ( signed long int )bli_obj_diag_offset( *obj ) );
|
||||
fprintf( file, "\n" );
|
||||
|
||||
|
||||
@@ -592,6 +592,16 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part,
|
||||
requested_part != BLIS_SUBPART11 &&
|
||||
requested_part != BLIS_SUBPART22 )
|
||||
{
|
||||
// FGVZ: Fix me. This needs to be cleaned up. Either non-diagonal
|
||||
// intersecting subpartitions should inherit their root object's
|
||||
// uplo field, or they should not. Right now, they DO inherit the
|
||||
// uplo (because they are not set to BLIS_DENSE when the diagonal
|
||||
// does not intersect). But the whole point of being able to query
|
||||
// the root object's properties (e.g. uplo field) was so that we
|
||||
// COULD mark such subpartitions as dense, to make it easier for
|
||||
// certain subproblems on those subpartitions--subproblems that
|
||||
// are agnostic to where the subpartition came from.
|
||||
|
||||
// NOTE: This comment may be out-of-date since we now distinguish
|
||||
// between uplo properties for the current and root objects...
|
||||
// Note that we cannot mark the subpartition object as general/dense
|
||||
|
||||
135
frame/base/bli_prune.c
Normal file
135
frame/base/bli_prune.c
Normal file
@@ -0,0 +1,135 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// Remove the unstored/unreferenced region of a structured object along one
// dimension and keep a companion object conformal with it.
//
//   p       primary object; its structure, uplo and diagonal offset decide
//           what (if anything) is pruned
//   mdim_p  dimension of p along which the caller is partitioning
//   s       secondary object sharing that dimension with p
//   mdim_s  dimension of s that corresponds to mdim_p
//
// Both objects are modified in place; nothing is returned.
void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p,
                             obj_t* s, mdim_t mdim_s )
{
	// A general primary object has no structure and therefore nothing
	// unreferenced to remove.
	if ( bli_obj_is_general( *p ) ) return;

	// A triangular primary object marked BLIS_ZEROS is collapsed to an
	// empty matrix along the partitioned dimension, and s with it. This is
	// an optimization rather than a requirement: with empty dimensions,
	// bli_get_range*() would return empty ranges, which would prevent the
	// variant's for loop from executing any iterations, so threads never
	// reach the macro-kernel for such operands.
	// NOTE: only triangular objects can have subpartitions marked as
	// BLIS_ZEROS, which is why the test is restricted to that structure.
	if ( bli_obj_is_triangular( *p ) && bli_obj_is_zeros( *p ) )
	{
		bli_obj_set_dim( mdim_p, 0, *p );
		bli_obj_set_dim( mdim_s, 0, *s );
		return;
	}

	// From here on the primary object is assumed to be hermitian,
	// symmetric, or triangular, with its unstored region unreferenced
	// (otherwise the caller should not be invoking this function on it).
	doff_t doff_p    = bli_obj_diag_offset( *p );
	dim_t  m_p       = bli_obj_length( *p );
	dim_t  n_p       = bli_obj_width( *p );
	uplo_t uplo_p    = bli_obj_uplo( *p );
	dim_t  off_delta = 0;

	// Account for implicit transposition of either object.
	if ( bli_obj_has_trans( *p ) )
	{
		bli_reflect_about_diag( doff_p, uplo_p, m_p, n_p );
		bli_toggle_dim( mdim_p );
	}
	if ( bli_obj_has_trans( *s ) )
	{
		bli_toggle_dim( mdim_s );
	}

	// Evaluated only after the toggle above, since that may have changed
	// which dimension mdim_p names.
	const bool_t along_m = bli_is_m_dim( mdim_p );

	// Choose the pruning direction from the stored triangle of p and the
	// dimension being partitioned.
	if ( bli_obj_is_lower( *p ) )
	{
		if ( along_m ) { bli_prune_unstored_region_top_l( doff_p, m_p, n_p, off_delta ); }
		else           { bli_prune_unstored_region_right_l( doff_p, m_p, n_p, off_delta ); }
	}
	else if ( bli_obj_is_upper( *p ) )
	{
		if ( along_m ) { bli_prune_unstored_region_bottom_u( doff_p, m_p, n_p, off_delta ); }
		else           { bli_prune_unstored_region_left_u( doff_p, m_p, n_p, off_delta ); }
	}
	else if ( bli_obj_is_dense( *p ) )
	{
		// Structured matrices are almost never dense, but a dense one has
		// no unreferenced region, so both objects are left untouched.
		return;
	}
	else // if ( bli_obj_is_zeros( *p ) )
	{
		// Sanity check: hermitian/symmetric matrices should never have
		// zero subpartitions.
		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
	}

	// The (possibly reduced) extent along the partitioned dimension.
	const dim_t q = ( along_m ? m_p : n_p );

	// Write back the results. Updating the dimension and diagonal offset
	// of a packed primary object is acceptable as long as the secondary
	// object is updated in tandem to maintain conformality; the ignorable
	// zero region is then skipped here rather than within the macro-kernel.
	bli_obj_set_diag_offset( doff_p, *p );
	bli_obj_set_dim( mdim_p, q, *p );
	bli_obj_set_dim( mdim_s, q, *s );

	// Offsets are advanced only for objects that are NOT packed; otherwise
	// bli_obj_buffer_at_off() would compute the wrong address within the
	// macro-kernel object wrapper.
	if ( !bli_obj_is_packed( *p ) ) { bli_obj_inc_off( mdim_p, off_delta, *p ); }
	if ( !bli_obj_is_packed( *s ) ) { bli_obj_inc_off( mdim_s, off_delta, *s ); }
}
|
||||
|
||||
36
frame/base/bli_prune.h
Normal file
36
frame/base/bli_prune.h
Normal file
@@ -0,0 +1,36 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Prunes the unreferenced region of the primary object p along dimension
// mdim_p and applies the matching adjustment to dimension mdim_s of the
// secondary object s.
void bli_prune_unref_mparts
     (
       obj_t* p, mdim_t mdim_p,
       obj_t* s, mdim_t mdim_s
     );
|
||||
@@ -157,16 +157,19 @@ void* bli_broadcast_structure( thread_comm_t* communicator, dim_t id, void* to_s
|
||||
}
|
||||
|
||||
// Code for work assignments
|
||||
void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t handle_edge_low, dim_t* start, dim_t* end )
|
||||
void bli_get_range( void* thr, dim_t n, dim_t bf, bool_t handle_edge_low, dim_t* start, dim_t* end )
|
||||
{
|
||||
thrinfo_t* thread = ( thrinfo_t* )thr;
|
||||
dim_t n_way = thread->n_way;
|
||||
dim_t work_id = thread->work_id;
|
||||
|
||||
dim_t all_start = 0;
|
||||
dim_t all_end = n;
|
||||
|
||||
dim_t size = all_end - all_start;
|
||||
|
||||
dim_t n_bf_whole = size / block_factor;
|
||||
dim_t n_bf_left = size % block_factor;
|
||||
dim_t n_bf_whole = size / bf;
|
||||
dim_t n_bf_left = size % bf;
|
||||
|
||||
dim_t n_bf_lo = n_bf_whole / n_way;
|
||||
dim_t n_bf_hi = n_bf_whole / n_way;
|
||||
@@ -217,8 +220,8 @@ void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_facto
|
||||
|
||||
// Compute the actual widths (in units of rows/columns) of
|
||||
// individual threads in the low and high groups.
|
||||
dim_t size_lo = n_bf_lo * block_factor;
|
||||
dim_t size_hi = n_bf_hi * block_factor;
|
||||
dim_t size_lo = n_bf_lo * bf;
|
||||
dim_t size_hi = n_bf_hi * bf;
|
||||
|
||||
// Precompute the starting indices of the low and high groups.
|
||||
dim_t lo_start = all_start;
|
||||
@@ -257,8 +260,8 @@ void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_facto
|
||||
|
||||
// Compute the actual widths (in units of rows/columns) of
|
||||
// individual threads in the low and high groups.
|
||||
dim_t size_lo = n_bf_lo * block_factor;
|
||||
dim_t size_hi = n_bf_hi * block_factor;
|
||||
dim_t size_lo = n_bf_lo * bf;
|
||||
dim_t size_hi = n_bf_hi * bf;
|
||||
|
||||
// Precompute the starting indices of the low and high groups.
|
||||
dim_t lo_start = all_start;
|
||||
@@ -288,188 +291,514 @@ void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_facto
|
||||
}
|
||||
}
|
||||
|
||||
void bli_get_range_l2r( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
|
||||
siz_t bli_get_range_l2r( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
|
||||
{
|
||||
bli_get_range( thr, all_start, all_end, block_factor,
|
||||
dim_t m = bli_obj_length_after_trans( *a );
|
||||
dim_t n = bli_obj_width_after_trans( *a );
|
||||
|
||||
bli_get_range( thr, n, bf,
|
||||
FALSE, start, end );
|
||||
|
||||
return m * ( *end - *start );
|
||||
}
|
||||
|
||||
void bli_get_range_r2l( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
|
||||
siz_t bli_get_range_r2l( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
|
||||
{
|
||||
bli_get_range( thr, all_start, all_end, block_factor,
|
||||
dim_t m = bli_obj_length_after_trans( *a );
|
||||
dim_t n = bli_obj_width_after_trans( *a );
|
||||
|
||||
bli_get_range( thr, n, bf,
|
||||
TRUE, start, end );
|
||||
|
||||
return m * ( *end - *start );
|
||||
}
|
||||
|
||||
void bli_get_range_t2b( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
|
||||
siz_t bli_get_range_t2b( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
|
||||
{
|
||||
bli_get_range( thr, all_start, all_end, block_factor,
|
||||
dim_t m = bli_obj_length_after_trans( *a );
|
||||
dim_t n = bli_obj_width_after_trans( *a );
|
||||
|
||||
bli_get_range( thr, m, bf,
|
||||
FALSE, start, end );
|
||||
|
||||
return n * ( *end - *start );
|
||||
}
|
||||
|
||||
void bli_get_range_b2t( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
|
||||
siz_t bli_get_range_b2t( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
|
||||
{
|
||||
bli_get_range( thr, all_start, all_end, block_factor,
|
||||
dim_t m = bli_obj_length_after_trans( *a );
|
||||
dim_t n = bli_obj_width_after_trans( *a );
|
||||
|
||||
bli_get_range( thr, m, bf,
|
||||
TRUE, start, end );
|
||||
|
||||
return n * ( *end - *start );
|
||||
}
|
||||
|
||||
void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, bool_t handle_edge_low, dim_t* start, dim_t* end )
|
||||
dim_t bli_get_range_width_l( doff_t diagoff_j,
|
||||
dim_t m,
|
||||
dim_t n_j,
|
||||
dim_t j,
|
||||
dim_t n_way,
|
||||
dim_t bf,
|
||||
dim_t bf_left,
|
||||
double area_per_thr,
|
||||
bool_t handle_edge_low )
|
||||
{
|
||||
dim_t width;
|
||||
|
||||
// In this function, we assume that we are somewhere in the process of
|
||||
// partitioning an m x n lower-stored region (with arbitrary diagonal
|
||||
// offset) n_ways along the n dimension (into column panels). The value
|
||||
// j identifies the left-to-right subpartition index (from 0 to n_way-1)
|
||||
// of the subpartition whose width we are about to compute using the
|
||||
// area per thread determined by the caller. n_j is the number of
|
||||
// columns in the remaining region of the matrix being partitioned,
|
||||
// and diagoff_j is that region's diagonal offset.
|
||||
|
||||
// If this is the last subpartition, the width is simply equal to n_j.
|
||||
// Note that this statement handles cases where the "edge case" (if
|
||||
// one exists) is assigned to the high end of the index range (ie:
|
||||
// handle_edge_low == FALSE).
|
||||
if ( j == n_way - 1 ) return n_j;
|
||||
|
||||
// At this point, we know there are at least two subpartitions left.
|
||||
// We also know that IF the submatrix contains a completely dense
|
||||
// rectangular submatrix, it will occur BEFORE the triangular (or
|
||||
// trapezoidal) part.
|
||||
|
||||
// Here, we implement a somewhat minor load balancing optimization
|
||||
// that ends up getting employed only for relatively small matrices.
|
||||
// First, recall that all subpartition widths will be some multiple
|
||||
// of the blocking factor bf, except perhaps either the first or last
|
||||
// subpartition, which will receive the edge case, if it exists.
|
||||
// Also recall that j represents the current thread (or thread group,
|
||||
// or "caucus") for which we are computing a subpartition width.
|
||||
// If n_j is sufficiently small that we can only allocate bf columns
|
||||
// to each of the remaining threads, then we set the width to bf. We
|
||||
// do not allow the subpartition width to be less than bf, so, under
|
||||
// some conditions, if n_j is small enough, some of the reamining
|
||||
// threads may not get any work. For the purposes of this lower bound
|
||||
// on work (ie: width >= bf), we allow the edge case to count as a
|
||||
// "full" set of bf columns.
|
||||
{
|
||||
dim_t n_j_bf = n_j / bf + ( bf_left > 0 ? 1 : 0 );
|
||||
|
||||
if ( n_j_bf <= n_way - j )
|
||||
{
|
||||
if ( j == 0 && handle_edge_low )
|
||||
width = ( bf_left > 0 ? bf_left : bf );
|
||||
else
|
||||
width = bf;
|
||||
|
||||
// Make sure that the width does not exceed n_j. This would
|
||||
// occur if and when n_j_bf < n_way - j; that is, when the
|
||||
// matrix being partitioned is sufficiently small relative to
|
||||
// n_way such that there is not even enough work for every
|
||||
// (remaining) thread to get bf (or bf_left) columns. The
|
||||
// net effect of this safeguard is that some threads may get
|
||||
// assigned empty ranges (ie: no work), which of course must
|
||||
// happen in some situations.
|
||||
if ( width > n_j ) width = n_j;
|
||||
|
||||
return width;
|
||||
}
|
||||
}
|
||||
|
||||
// This block computes the width assuming that we are entirely within
|
||||
// a dense rectangle that precedes the triangular (or trapezoidal)
|
||||
// part.
|
||||
{
|
||||
// First compute the width of the current panel under the
|
||||
// assumption that the diagonal offset would not intersect.
|
||||
width = ( dim_t )bli_round( ( double )area_per_thr / ( double )m );
|
||||
|
||||
// Adjust the width, if necessary. Specifically, we may need
|
||||
// to allocate the edge case to the first subpartition, if
|
||||
// requested; otherwise, we just need to ensure that the
|
||||
// subpartition is a multiple of the blocking factor.
|
||||
if ( j == 0 && handle_edge_low )
|
||||
{
|
||||
if ( width % bf != bf_left ) width += bf_left - ( width % bf );
|
||||
}
|
||||
else // if interior case
|
||||
{
|
||||
// Round up to the next multiple of the blocking factor.
|
||||
//if ( width % bf != 0 ) width += bf - ( width % bf );
|
||||
// Round to the nearest multiple of the blocking factor.
|
||||
if ( width % bf != 0 ) width = bli_round_to_mult( width, bf );
|
||||
}
|
||||
}
|
||||
|
||||
// We need to recompute width if the panel, according to the width
|
||||
// as currently computed, would intersect the diagonal.
|
||||
if ( diagoff_j < width )
|
||||
{
|
||||
dim_t offm_inc, offn_inc;
|
||||
|
||||
// Prune away the unstored region above the diagonal, if it exists.
|
||||
// Note that the entire region was pruned initially, so we know that
|
||||
// we don't need to try to prune the right side. (Also, we discard
|
||||
// the offset deltas since we don't need to actually index into the
|
||||
// subpartition.)
|
||||
bli_prune_unstored_region_top_l( diagoff_j, m, n_j, offm_inc );
|
||||
//bli_prune_unstored_region_right_l( diagoff_j, m, n_j, offn_inc );
|
||||
|
||||
// We don't need offm_inc, offn_inc here. These statements should
|
||||
// prevent compiler warnings.
|
||||
( void )offm_inc;
|
||||
( void )offn_inc;
|
||||
|
||||
// Solve a quadratic equation to find the width of the current (jth)
|
||||
// subpartition given the m dimension, diagonal offset, and area.
|
||||
// NOTE: We know that the +/- in the quadratic formula must be a +
|
||||
// here because we know that the desired solution (the subpartition
|
||||
// width) will be smaller than (m + diagoff), not larger. If you
|
||||
// don't believe me, draw a picture!
|
||||
const double a = -0.5;
|
||||
const double b = ( double )m + ( double )diagoff_j + 0.5;
|
||||
const double c = -0.5 * ( ( double )diagoff_j *
|
||||
( ( double )diagoff_j + 1.0 )
|
||||
) - area_per_thr;
|
||||
const double x = ( -b + sqrt( b * b - 4.0 * a * c ) ) / ( 2.0 * a );
|
||||
|
||||
// Use the rounded solution as our width, but make sure it didn't
|
||||
// round to zero.
|
||||
width = ( dim_t )bli_round( x );
|
||||
if ( width == 0 ) width = 1;
|
||||
|
||||
// Adjust the width, if necessary.
|
||||
if ( j == 0 && handle_edge_low )
|
||||
{
|
||||
if ( width % bf != bf_left ) width += bf_left - ( width % bf );
|
||||
}
|
||||
else // if interior case
|
||||
{
|
||||
// Round up to the next multiple of the blocking factor.
|
||||
//if ( width % bf != 0 ) width += bf - ( width % bf );
|
||||
// Round to the nearest multiple of the blocking factor.
|
||||
if ( width % bf != 0 ) width = bli_round_to_mult( width, bf );
|
||||
}
|
||||
}
|
||||
|
||||
// Make sure that the width, after being adjusted, does not cause the
|
||||
// subpartition to exceed n_j.
|
||||
if ( width > n_j ) width = n_j;
|
||||
|
||||
return width;
|
||||
}
|
||||
|
||||
// Return the area of the stored part of an m x n lower-stored trapezoid
// whose diagonal offset is diagoff: the area of the bounding rectangle
// (after pruning) minus the area of the empty triangle, if there is one.
siz_t bli_find_area_trap_l( dim_t m, dim_t n, doff_t diagoff )
{
	dim_t  skipped_m   = 0;
	dim_t  skipped_n   = 0;
	double unstored    = 0.0;

	// First drop any rectangular region above the point where the diagonal
	// meets the left edge, then any rectangular region to the right of the
	// point where it meets the bottom edge. (The second step should rarely
	// matter, since callers are expected to have pruned rightward already.)
	// Both are invoked by name, so m, n and diagoff may be updated here.
	bli_prune_unstored_region_top_l( diagoff, m, n, skipped_m );
	bli_prune_unstored_region_right_l( diagoff, m, n, skipped_n );

	// The offset deltas are not needed; reference them to avoid warnings.
	( void )skipped_m;
	( void )skipped_n;

	// When the diagonal crosses the trapezoid, an empty triangle sits above
	// it and must be excluded. Otherwise the region is a plain rectangle
	// and nothing is subtracted.
	if ( bli_intersects_diag_n( diagoff, m, n ) )
	{
		const double side = ( double )( n - diagoff - 1 );

		unstored = side * ( side + 1.0 ) / 2.0;
	}

	const double stored = ( double )m * ( double )n - unstored;

	return ( siz_t )stored;
}
|
||||
|
||||
siz_t bli_get_range_weighted( void* thr,
|
||||
doff_t diagoff,
|
||||
uplo_t uplo,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t bf,
|
||||
bool_t handle_edge_low,
|
||||
dim_t* j_start_thr,
|
||||
dim_t* j_end_thr )
|
||||
{
|
||||
thrinfo_t* thread = ( thrinfo_t* )thr;
|
||||
dim_t n_way = thread->n_way;
|
||||
dim_t work_id = thread->work_id;
|
||||
dim_t size = all_end - all_start;
|
||||
dim_t width;
|
||||
dim_t block_fac_leftover = size % block_factor;
|
||||
dim_t i;
|
||||
double num;
|
||||
|
||||
*start = 0;
|
||||
*end = all_end - all_start;
|
||||
num = size * size / ( double )n_way;
|
||||
dim_t n_way = thread->n_way;
|
||||
dim_t my_id = thread->work_id;
|
||||
|
||||
dim_t bf_left = n % bf;
|
||||
|
||||
dim_t j;
|
||||
|
||||
dim_t off_j;
|
||||
doff_t diagoff_j;
|
||||
dim_t n_left;
|
||||
|
||||
dim_t width_j;
|
||||
|
||||
dim_t offm_inc, offn_inc;
|
||||
|
||||
double tri_dim, tri_area;
|
||||
double area_total, area_per_thr;
|
||||
|
||||
siz_t area = 0;
|
||||
|
||||
// In this function, we assume that the caller has already determined
|
||||
// that (a) the diagonal intersects the submatrix, and (b) the submatrix
|
||||
// is either lower- or upper-stored.
|
||||
|
||||
if ( bli_is_lower( uplo ) )
|
||||
{
|
||||
dim_t cur_caucus = n_way - 1;
|
||||
dim_t len = 0;
|
||||
// Prune away the unstored region above the diagonal, if it exists,
|
||||
// and then to the right of where the diagonal intersects the bottom,
|
||||
// if it exists. (Also, we discard the offset deltas since we don't
|
||||
// need to actually index into the subpartition.)
|
||||
bli_prune_unstored_region_top_l( diagoff, m, n, offm_inc );
|
||||
bli_prune_unstored_region_right_l( diagoff, m, n, offn_inc );
|
||||
|
||||
// This loop computes subpartitions backwards, from the high end
|
||||
// of the index range to the low end. If the low end is assumed
|
||||
// to be on the left and the high end the right, this assignment
|
||||
// of widths is appropriate for n dimension partitioning of a
|
||||
// lower triangular matrix.
|
||||
for ( i = 0; TRUE; ++i )
|
||||
// We don't need offm_inc, offn_inc here. These statements should
|
||||
// prevent compiler warnings.
|
||||
( void )offm_inc;
|
||||
( void )offn_inc;
|
||||
|
||||
// Now that pruning has taken place, we know that diagoff >= 0.
|
||||
|
||||
// Compute the total area of the submatrix, accounting for the
|
||||
// location of the diagonal, and divide it by the number of ways
|
||||
// of parallelism.
|
||||
tri_dim = ( double )( n - diagoff - 1 );
|
||||
tri_area = tri_dim * ( tri_dim + 1.0 ) / 2.0;
|
||||
area_total = ( double )m * ( double )n - tri_area;
|
||||
area_per_thr = area_total / ( double )n_way;
|
||||
|
||||
// Initialize some variables prior to the loop: the offset to the
|
||||
// current subpartition, the remainder of the n dimension, and
|
||||
// the diagonal offset of the current subpartition.
|
||||
off_j = 0;
|
||||
diagoff_j = diagoff;
|
||||
n_left = n;
|
||||
|
||||
// Iterate over the subpartition indices corresponding to each
|
||||
// thread/caucus participating in the n_way parallelism.
|
||||
for ( j = 0; j < n_way; ++j )
|
||||
{
|
||||
width = ceil( sqrt( len*len + num ) ) - len;
|
||||
// Compute the width of the jth subpartition, taking the
|
||||
// current diagonal offset into account, if needed.
|
||||
width_j = bli_get_range_width_l( diagoff_j, m, n_left,
|
||||
j, n_way,
|
||||
bf, bf_left,
|
||||
area_per_thr,
|
||||
handle_edge_low );
|
||||
|
||||
// If we need to allocate the edge case (assuming it exists)
|
||||
// to the high thread subpartition, adjust width so that it
|
||||
// contains the exact amount of leftover edge dimension so that
|
||||
// all remaining subpartitions can be multiples of block_factor.
|
||||
// If the edge case is to be allocated to the low subpartition,
|
||||
// or if there is no edge case, it is implicitly allocated to
|
||||
// the low subpartition by virtue of the fact that all other
|
||||
// subpartitions already assigned will be multiples of
|
||||
// block_factor.
|
||||
if ( i == 0 && !handle_edge_low )
|
||||
// If the current thread belongs to caucus j, this is his
|
||||
// subpartition. So we compute the implied index range and
|
||||
// end our search.
|
||||
if ( j == my_id )
|
||||
{
|
||||
if ( width % block_factor != block_fac_leftover )
|
||||
width += block_fac_leftover - ( width % block_factor );
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( width % block_factor != 0 )
|
||||
width += block_factor - ( width % block_factor );
|
||||
*j_start_thr = off_j;
|
||||
*j_end_thr = off_j + width_j;
|
||||
|
||||
area = bli_find_area_trap_l( m, width_j, diagoff_j );
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
if ( cur_caucus == work_id )
|
||||
{
|
||||
*start = bli_max( 0, *end - width ) + all_start;
|
||||
*end = *end + all_start;
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
*end -= width;
|
||||
len += width;
|
||||
cur_caucus--;
|
||||
}
|
||||
// Shift the current subpartition's starting and diagonal offsets,
|
||||
// as well as the remainder of the n dimension, according to the
|
||||
// computed width, and then iterate to the next subpartition.
|
||||
off_j += width_j;
|
||||
diagoff_j -= width_j;
|
||||
n_left -= width_j;
|
||||
}
|
||||
}
|
||||
else // if ( bli_is_upper( uplo ) )
|
||||
{
|
||||
// This loop computes subpartitions forwards, from the low end
|
||||
// of the index range to the high end. If the low end is assumed
|
||||
// to be on the left and the high end the right, this assignment
|
||||
// of widths is appropriate for n dimension partitioning of an
|
||||
// upper triangular matrix.
|
||||
for ( i = 0; TRUE; ++i )
|
||||
// Express the upper-stored case in terms of the lower-stored case.
|
||||
|
||||
// First, we convert the upper-stored trapezoid to an equivalent
|
||||
// lower-stored trapezoid by rotating it 180 degrees.
|
||||
bli_rotate180_trapezoid( diagoff, uplo );
|
||||
|
||||
// Now that the trapezoid is "flipped" in the n dimension, negate
|
||||
// the bool that encodes whether to handle the edge case at the
|
||||
// low (or high) end of the index range.
|
||||
bli_toggle_bool( handle_edge_low );
|
||||
|
||||
// Compute the appropriate range for the rotated trapezoid.
|
||||
area = bli_get_range_weighted( thr, diagoff, uplo, m, n, bf,
|
||||
handle_edge_low,
|
||||
j_start_thr, j_end_thr );
|
||||
|
||||
// Reverse the indexing basis for the subpartition ranges so that
|
||||
// the indices, relative to left-to-right iteration through the
|
||||
// unrotated upper-stored trapezoid, map to the correct columns
|
||||
// (relative to the diagonal). This amounts to subtracting the
|
||||
// range from n.
|
||||
bli_reverse_index_direction( *j_start_thr, *j_end_thr, n );
|
||||
}
|
||||
|
||||
return area;
|
||||
}
|
||||
|
||||
siz_t bli_get_range_weighted_l2r( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
|
||||
{
|
||||
siz_t area;
|
||||
|
||||
// This function assigns area-weighted ranges in the n dimension
|
||||
// where the total range spans 0 to n-1 with 0 at the left end and
|
||||
// n-1 at the right end.
|
||||
|
||||
if ( bli_obj_intersects_diag( *a ) &&
|
||||
bli_obj_is_upper_or_lower( *a ) )
|
||||
{
|
||||
doff_t diagoff = bli_obj_diag_offset( *a );
|
||||
uplo_t uplo = bli_obj_uplo( *a );
|
||||
dim_t m = bli_obj_length( *a );
|
||||
dim_t n = bli_obj_width( *a );
|
||||
|
||||
// Support implicit transposition.
|
||||
if ( bli_obj_has_trans( *a ) )
|
||||
{
|
||||
width = ceil( sqrt( *start * *start + num ) ) - *start;
|
||||
|
||||
if ( i == 0 && handle_edge_low )
|
||||
{
|
||||
if ( width % block_factor != block_fac_leftover )
|
||||
width += block_fac_leftover - ( width % block_factor );
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( width % block_factor != 0 )
|
||||
width += block_factor - ( width % block_factor );
|
||||
}
|
||||
|
||||
if ( work_id == 0 )
|
||||
{
|
||||
*start = *start + all_start;
|
||||
*end = bli_min( *start + width, all_end );
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
*start = *start + width;
|
||||
work_id--;
|
||||
}
|
||||
bli_reflect_about_diag( diagoff, uplo, m, n );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void bli_get_range_weighted_l2r( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
|
||||
{
|
||||
if ( bli_is_upper_or_lower( uplo ) )
|
||||
{
|
||||
bli_get_range_weighted( thr, all_start, all_end, block_factor,
|
||||
uplo, FALSE, start, end );
|
||||
area = bli_get_range_weighted( thr, diagoff, uplo, m, n, bf,
|
||||
FALSE, start, end );
|
||||
}
|
||||
else // if dense or zeros
|
||||
{
|
||||
bli_get_range_l2r( thr, all_start, all_end, block_factor,
|
||||
start, end );
|
||||
area = bli_get_range_l2r( thr, a, bf,
|
||||
start, end );
|
||||
}
|
||||
|
||||
return area;
|
||||
}
|
||||
|
||||
void bli_get_range_weighted_r2l( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
|
||||
siz_t bli_get_range_weighted_r2l( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
|
||||
{
|
||||
if ( bli_is_upper_or_lower( uplo ) )
|
||||
siz_t area;
|
||||
|
||||
// This function assigns area-weighted ranges in the n dimension
|
||||
// where the total range spans 0 to n-1 with 0 at the right end and
|
||||
// n-1 at the left end.
|
||||
|
||||
if ( bli_obj_intersects_diag( *a ) &&
|
||||
bli_obj_is_upper_or_lower( *a ) )
|
||||
{
|
||||
//printf( "bli_get_range_weighted_r2l: is upper or lower\n" );
|
||||
bli_toggle_uplo( uplo );
|
||||
bli_get_range_weighted( thr, all_start, all_end, block_factor,
|
||||
uplo, TRUE, start, end );
|
||||
doff_t diagoff = bli_obj_diag_offset( *a );
|
||||
uplo_t uplo = bli_obj_uplo( *a );
|
||||
dim_t m = bli_obj_length( *a );
|
||||
dim_t n = bli_obj_width( *a );
|
||||
|
||||
// Support implicit transposition.
|
||||
if ( bli_obj_has_trans( *a ) )
|
||||
{
|
||||
bli_reflect_about_diag( diagoff, uplo, m, n );
|
||||
}
|
||||
|
||||
bli_rotate180_trapezoid( diagoff, uplo );
|
||||
|
||||
area = bli_get_range_weighted( thr, diagoff, uplo, m, n, bf,
|
||||
TRUE, start, end );
|
||||
}
|
||||
else // if dense or zeros
|
||||
{
|
||||
//printf( "bli_get_range_weighted_r2l: is dense or zeros\n" );
|
||||
bli_get_range_r2l( thr, all_start, all_end, block_factor,
|
||||
start, end );
|
||||
area = bli_get_range_r2l( thr, a, bf,
|
||||
start, end );
|
||||
}
|
||||
|
||||
return area;
|
||||
}
|
||||
|
||||
void bli_get_range_weighted_t2b( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
|
||||
siz_t bli_get_range_weighted_t2b( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
|
||||
{
|
||||
if ( bli_is_upper_or_lower( uplo ) )
|
||||
siz_t area;
|
||||
|
||||
// This function assigns area-weighted ranges in the m dimension
|
||||
// where the total range spans 0 to m-1 with 0 at the top end and
|
||||
// m-1 at the bottom end.
|
||||
|
||||
if ( bli_obj_intersects_diag( *a ) &&
|
||||
bli_obj_is_upper_or_lower( *a ) )
|
||||
{
|
||||
bli_toggle_uplo( uplo );
|
||||
bli_get_range_weighted( thr, all_start, all_end, block_factor,
|
||||
uplo, FALSE, start, end );
|
||||
doff_t diagoff = bli_obj_diag_offset( *a );
|
||||
uplo_t uplo = bli_obj_uplo( *a );
|
||||
dim_t m = bli_obj_length( *a );
|
||||
dim_t n = bli_obj_width( *a );
|
||||
|
||||
// Support implicit transposition.
|
||||
if ( bli_obj_has_trans( *a ) )
|
||||
{
|
||||
bli_reflect_about_diag( diagoff, uplo, m, n );
|
||||
}
|
||||
|
||||
bli_reflect_about_diag( diagoff, uplo, m, n );
|
||||
|
||||
area = bli_get_range_weighted( thr, diagoff, uplo, m, n, bf,
|
||||
FALSE, start, end );
|
||||
}
|
||||
else // if dense or zeros
|
||||
{
|
||||
bli_get_range_t2b( thr, all_start, all_end, block_factor,
|
||||
start, end );
|
||||
area = bli_get_range_t2b( thr, a, bf,
|
||||
start, end );
|
||||
}
|
||||
|
||||
return area;
|
||||
}
|
||||
|
||||
void bli_get_range_weighted_b2t( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
|
||||
siz_t bli_get_range_weighted_b2t( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
|
||||
{
|
||||
if ( bli_is_upper_or_lower( uplo ) )
|
||||
siz_t area;
|
||||
|
||||
// This function assigns area-weighted ranges in the m dimension
|
||||
// where the total range spans 0 to m-1 with 0 at the bottom end and
|
||||
// m-1 at the top end.
|
||||
|
||||
if ( bli_obj_intersects_diag( *a ) &&
|
||||
bli_obj_is_upper_or_lower( *a ) )
|
||||
{
|
||||
bli_get_range_weighted( thr, all_start, all_end, block_factor,
|
||||
uplo, TRUE, start, end );
|
||||
doff_t diagoff = bli_obj_diag_offset( *a );
|
||||
uplo_t uplo = bli_obj_uplo( *a );
|
||||
dim_t m = bli_obj_length( *a );
|
||||
dim_t n = bli_obj_width( *a );
|
||||
|
||||
// Support implicit transposition.
|
||||
if ( bli_obj_has_trans( *a ) )
|
||||
{
|
||||
bli_reflect_about_diag( diagoff, uplo, m, n );
|
||||
}
|
||||
|
||||
bli_reflect_about_diag( diagoff, uplo, m, n );
|
||||
|
||||
bli_rotate180_trapezoid( diagoff, uplo );
|
||||
|
||||
area = bli_get_range_weighted( thr, diagoff, uplo, m, n, bf,
|
||||
TRUE, start, end );
|
||||
}
|
||||
else // if dense or zeros
|
||||
{
|
||||
bli_get_range_b2t( thr, all_start, all_end, block_factor,
|
||||
start, end );
|
||||
area = bli_get_range_b2t( thr, a, bf,
|
||||
start, end );
|
||||
}
|
||||
|
||||
return area;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -101,13 +101,25 @@ void bli_barrier( thread_comm_t* communicator, dim_t thread_id );
|
||||
|
||||
struct thrinfo_s
|
||||
{
|
||||
thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t ocomm_id; //Our thread id within that thread comm
|
||||
thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t icomm_id; //Our thread id within that thread comm
|
||||
// The thread communicator for the other threads sharing the same work
|
||||
// at this level.
|
||||
thread_comm_t* ocomm;
|
||||
|
||||
dim_t n_way; //Number of distinct used to parallelize the loop
|
||||
dim_t work_id; //What we're working on
|
||||
// Our thread id within the ocomm thread communicator.
|
||||
dim_t ocomm_id;
|
||||
|
||||
// The thread communicator for the other threads sharing the same work
|
||||
// at this level.
|
||||
thread_comm_t* icomm;
|
||||
|
||||
// Our thread id within the icomm thread communicator.
|
||||
dim_t icomm_id;
|
||||
|
||||
// The number of distinct threads used to parallelize the loop.
|
||||
dim_t n_way;
|
||||
|
||||
// What we're working on.
|
||||
dim_t work_id;
|
||||
};
|
||||
typedef struct thrinfo_s thrinfo_t;
|
||||
|
||||
@@ -128,39 +140,37 @@ typedef struct thrinfo_s thrinfo_t;
|
||||
#define thread_obarrier( thread ) bli_barrier( thread->ocomm, thread->ocomm_id )
|
||||
#define thread_ibarrier( thread ) bli_barrier( thread->icomm, thread->icomm_id )
|
||||
|
||||
void bli_get_range( void* thr, dim_t all_start, dim_t all_end,
|
||||
dim_t block_factor,
|
||||
bool_t handle_edge_low,
|
||||
void bli_get_range( void* thr, dim_t n, dim_t bf, bool_t handle_edge_low,
|
||||
dim_t* start, dim_t* end );
|
||||
void bli_get_range_l2r( void* thr, dim_t all_start, dim_t all_end,
|
||||
dim_t block_factor,
|
||||
dim_t* start, dim_t* end );
|
||||
void bli_get_range_r2l( void* thr, dim_t all_start, dim_t all_end,
|
||||
dim_t block_factor,
|
||||
dim_t* start, dim_t* end );
|
||||
void bli_get_range_t2b( void* thr, dim_t all_start, dim_t all_end,
|
||||
dim_t block_factor,
|
||||
dim_t* start, dim_t* end );
|
||||
void bli_get_range_b2t( void* thr, dim_t all_start, dim_t all_end,
|
||||
dim_t block_factor,
|
||||
dim_t* start, dim_t* end );
|
||||
siz_t bli_get_range_l2r( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
|
||||
siz_t bli_get_range_r2l( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
|
||||
siz_t bli_get_range_t2b( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
|
||||
siz_t bli_get_range_b2t( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
|
||||
|
||||
void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end,
|
||||
dim_t block_factor, uplo_t uplo,
|
||||
bool_t handle_edge_low,
|
||||
dim_t* start, dim_t* end );
|
||||
void bli_get_range_weighted_l2r( void* thr, dim_t all_start, dim_t all_end,
|
||||
dim_t block_factor, uplo_t uplo,
|
||||
dim_t* start, dim_t* end );
|
||||
void bli_get_range_weighted_r2l( void* thr, dim_t all_start, dim_t all_end,
|
||||
dim_t block_factor, uplo_t uplo,
|
||||
dim_t* start, dim_t* end );
|
||||
void bli_get_range_weighted_t2b( void* thr, dim_t all_start, dim_t all_end,
|
||||
dim_t block_factor, uplo_t uplo,
|
||||
dim_t* start, dim_t* end );
|
||||
void bli_get_range_weighted_b2t( void* thr, dim_t all_start, dim_t all_end,
|
||||
dim_t block_factor, uplo_t uplo,
|
||||
dim_t* start, dim_t* end );
|
||||
dim_t bli_get_range_width_l( doff_t diagoff_j,
|
||||
dim_t m,
|
||||
dim_t n_j,
|
||||
dim_t j,
|
||||
dim_t n_way,
|
||||
dim_t bf,
|
||||
dim_t bf_left,
|
||||
double area_per_thr,
|
||||
bool_t handle_edge_low );
|
||||
siz_t bli_find_area_trap_l( dim_t m, dim_t n, doff_t diagoff );
|
||||
siz_t bli_get_range_weighted( void* thr,
|
||||
doff_t diagoff,
|
||||
uplo_t uplo,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t bf,
|
||||
bool_t handle_edge_low,
|
||||
dim_t* j_start_thr,
|
||||
dim_t* j_end_thr );
|
||||
|
||||
siz_t bli_get_range_weighted_l2r( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
|
||||
siz_t bli_get_range_weighted_r2l( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
|
||||
siz_t bli_get_range_weighted_t2b( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
|
||||
siz_t bli_get_range_weighted_b2t( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
|
||||
|
||||
thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
|
||||
@@ -156,8 +156,8 @@
|
||||
|
||||
#define bli_obj_is_upper_or_lower( obj ) \
|
||||
\
|
||||
( ( (obj).info & BLIS_UPLO_BITS ) == BLIS_BITVAL_UPPER || \
|
||||
( (obj).info & BLIS_UPLO_BITS ) == BLIS_BITVAL_LOWER )
|
||||
( bli_obj_is_upper( obj ) || \
|
||||
bli_obj_is_lower( obj ) )
|
||||
|
||||
#define bli_obj_is_dense( obj ) \
|
||||
\
|
||||
@@ -441,11 +441,15 @@
|
||||
|
||||
#define bli_obj_length( obj ) \
|
||||
\
|
||||
((obj).m)
|
||||
( (obj).dim[BLIS_M] )
|
||||
|
||||
#define bli_obj_width( obj ) \
|
||||
\
|
||||
((obj).n)
|
||||
( (obj).dim[BLIS_N] )
|
||||
|
||||
#define bli_obj_dim( mdim, obj ) \
|
||||
\
|
||||
( (obj).dim[mdim] )
|
||||
|
||||
#define bli_obj_min_dim( obj ) \
|
||||
\
|
||||
@@ -579,23 +583,38 @@ bli_obj_width_stored( obj )
|
||||
|
||||
// Dimension modification
|
||||
|
||||
#define bli_obj_set_length( dim_m, obj ) \
|
||||
{ \
|
||||
(obj).dim[BLIS_M] = dim_m; \
|
||||
}
|
||||
|
||||
#define bli_obj_set_width( dim_n, obj ) \
|
||||
{ \
|
||||
(obj).dim[BLIS_N] = dim_n; \
|
||||
}
|
||||
|
||||
#define bli_obj_set_dim( mdim, dim_val, obj ) \
|
||||
{ \
|
||||
(obj).dim[mdim] = dim_val; \
|
||||
}
|
||||
|
||||
#define bli_obj_set_dims( dim_m, dim_n, obj ) \
|
||||
{ \
|
||||
(obj).m = dim_m; \
|
||||
(obj).n = dim_n; \
|
||||
bli_obj_set_length( dim_m, obj ); \
|
||||
bli_obj_set_width( dim_n, obj ); \
|
||||
}
|
||||
|
||||
#define bli_obj_set_dims_with_trans( trans, dim_m, dim_n, obj ) \
|
||||
{ \
|
||||
if ( bli_does_notrans( trans ) ) \
|
||||
{ \
|
||||
(obj).m = dim_m; \
|
||||
(obj).n = dim_n; \
|
||||
bli_obj_set_length( dim_m, obj ); \
|
||||
bli_obj_set_width( dim_n, obj ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
(obj).m = dim_n; \
|
||||
(obj).n = dim_m; \
|
||||
bli_obj_set_length( dim_n, obj ); \
|
||||
bli_obj_set_width( dim_m, obj ); \
|
||||
} \
|
||||
}
|
||||
|
||||
@@ -604,15 +623,15 @@ bli_obj_width_stored( obj )
|
||||
|
||||
#define bli_obj_row_stride( obj ) \
|
||||
\
|
||||
((obj).rs)
|
||||
( (obj).rs )
|
||||
|
||||
#define bli_obj_col_stride( obj ) \
|
||||
\
|
||||
((obj).cs)
|
||||
( (obj).cs )
|
||||
|
||||
#define bli_obj_imag_stride( obj ) \
|
||||
\
|
||||
((obj).is)
|
||||
( (obj).is )
|
||||
|
||||
#define bli_obj_row_stride_mag( obj ) \
|
||||
\
|
||||
@@ -671,41 +690,60 @@ bli_obj_width_stored( obj )
|
||||
|
||||
// Offset query
|
||||
|
||||
#define bli_obj_row_offset( obj ) \
|
||||
#define bli_obj_row_off( obj ) \
|
||||
\
|
||||
( (obj).offm )
|
||||
( (obj).off[BLIS_M] )
|
||||
|
||||
#define bli_obj_col_offset( obj ) \
|
||||
#define bli_obj_col_off( obj ) \
|
||||
\
|
||||
( (obj).offn )
|
||||
( (obj).off[BLIS_N] )
|
||||
|
||||
#define bli_obj_off( mdim, obj ) \
|
||||
\
|
||||
( (obj).off[mdim] )
|
||||
|
||||
|
||||
// Offset modification
|
||||
|
||||
#define bli_obj_set_off( mdim, offset, obj ) \
|
||||
{ \
|
||||
(obj).off[mdim] = offset; \
|
||||
}
|
||||
|
||||
#define bli_obj_set_offs( offset_m, offset_n, obj ) \
|
||||
{ \
|
||||
(obj).offm = offset_m; \
|
||||
(obj).offn = offset_n; \
|
||||
bli_obj_set_off( BLIS_M, offset_m, obj ); \
|
||||
bli_obj_set_off( BLIS_N, offset_n, obj ); \
|
||||
}
|
||||
|
||||
#define bli_obj_inc_off( mdim, offset, obj ) \
|
||||
{ \
|
||||
(obj).off[mdim] += offset; \
|
||||
}
|
||||
|
||||
#define bli_obj_inc_offm( offset, obj ) \
|
||||
{ \
|
||||
bli_obj_inc_off( BLIS_M, offset, obj ); \
|
||||
}
|
||||
|
||||
#define bli_obj_inc_offn( offset, obj ) \
|
||||
{ \
|
||||
bli_obj_inc_off( BLIS_N, offset, obj ); \
|
||||
}
|
||||
|
||||
#define bli_obj_inc_offs( offset_m, offset_n, obj ) \
|
||||
{ \
|
||||
(obj).offm += offset_m; \
|
||||
(obj).offn += offset_n; \
|
||||
bli_obj_inc_off( BLIS_M, offset_m, obj ); \
|
||||
bli_obj_inc_off( BLIS_N, offset_n, obj ); \
|
||||
}
|
||||
|
||||
#define bli_obj_dec_offs( offset_m, offset_n, obj ) \
|
||||
{ \
|
||||
(obj).offm -= offset_m; \
|
||||
(obj).offn -= offset_n; \
|
||||
}
|
||||
|
||||
|
||||
// Diagonal offset query
|
||||
|
||||
#define bli_obj_diag_offset( obj ) \
|
||||
\
|
||||
((obj).diag_off)
|
||||
( (obj).diag_off )
|
||||
|
||||
#define bli_obj_diag_offset_after_trans( obj ) \
|
||||
\
|
||||
@@ -762,7 +800,7 @@ bli_obj_width_stored( obj )
|
||||
|
||||
#define bli_obj_buffer( obj ) \
|
||||
\
|
||||
(obj).buffer
|
||||
( (obj).buffer )
|
||||
|
||||
// Buffer address modification
|
||||
|
||||
@@ -776,7 +814,7 @@ bli_obj_width_stored( obj )
|
||||
|
||||
#define bli_obj_internal_scalar_buffer( obj ) \
|
||||
\
|
||||
&((obj).scalar)
|
||||
&( (obj).scalar )
|
||||
|
||||
// Bufferless scalar field modification
|
||||
|
||||
@@ -794,7 +832,7 @@ bli_obj_width_stored( obj )
|
||||
|
||||
#define bli_obj_elem_size( obj ) \
|
||||
\
|
||||
(obj).elem_size \
|
||||
( (obj).elem_size )
|
||||
|
||||
// Element size modification
|
||||
|
||||
@@ -851,19 +889,19 @@ bli_obj_width_stored( obj )
|
||||
|
||||
#define bli_obj_panel_length( obj ) \
|
||||
\
|
||||
((obj).m_panel)
|
||||
( (obj).m_panel )
|
||||
|
||||
#define bli_obj_panel_width( obj ) \
|
||||
\
|
||||
((obj).n_panel)
|
||||
( (obj).n_panel )
|
||||
|
||||
#define bli_obj_panel_dim( obj ) \
|
||||
\
|
||||
((obj).pd)
|
||||
( (obj).pd )
|
||||
|
||||
#define bli_obj_panel_stride( obj ) \
|
||||
\
|
||||
((obj).ps)
|
||||
( (obj).ps )
|
||||
|
||||
// Packed panel info modification
|
||||
|
||||
@@ -969,15 +1007,19 @@ bli_obj_width_stored( obj )
|
||||
#define bli_obj_buffer_for_const( dt, obj ) \
|
||||
\
|
||||
( void* )( \
|
||||
( ( char* )( (obj).buffer ) ) + dt * BLIS_CONSTANT_SLOT_SIZE \
|
||||
( ( char* )( bli_obj_buffer( obj ) ) ) + \
|
||||
( dim_t )( dt * BLIS_CONSTANT_SLOT_SIZE ) \
|
||||
)
|
||||
|
||||
#define bli_obj_buffer_at_off( obj ) \
|
||||
\
|
||||
( void* )( \
|
||||
( ( char* )( (obj).buffer ) ) + ( dim_t )(obj).elem_size * \
|
||||
( (obj).offn * (obj).cs + \
|
||||
(obj).offm * (obj).rs ) \
|
||||
( ( char* )( bli_obj_buffer ( obj ) ) + \
|
||||
( dim_t )( bli_obj_elem_size( obj ) ) * \
|
||||
( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + \
|
||||
bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) \
|
||||
) \
|
||||
) \
|
||||
)
|
||||
|
||||
#define bli_obj_buffer_for_1x1( dt, obj ) \
|
||||
@@ -1015,8 +1057,8 @@ bli_obj_width_stored( obj )
|
||||
dim_t n = bli_obj_width( obj ); \
|
||||
inc_t rs = bli_obj_row_stride( obj ); \
|
||||
inc_t cs = bli_obj_col_stride( obj ); \
|
||||
dim_t offm = bli_obj_row_offset( obj ); \
|
||||
dim_t offn = bli_obj_col_offset( obj ); \
|
||||
dim_t offm = bli_obj_row_off( obj ); \
|
||||
dim_t offn = bli_obj_col_off( obj ); \
|
||||
doff_t diag_off = bli_obj_diag_offset( obj ); \
|
||||
\
|
||||
bli_obj_set_dims( n, m, obj ); \
|
||||
@@ -1047,8 +1089,8 @@ bli_obj_width_stored( obj )
|
||||
{ \
|
||||
dim_t m = bli_obj_length( obj ); \
|
||||
dim_t n = bli_obj_width( obj ); \
|
||||
dim_t offm = bli_obj_row_offset( obj ); \
|
||||
dim_t offn = bli_obj_col_offset( obj ); \
|
||||
dim_t offm = bli_obj_row_off( obj ); \
|
||||
dim_t offn = bli_obj_col_off( obj ); \
|
||||
doff_t diag_off = bli_obj_diag_offset( obj ); \
|
||||
\
|
||||
bli_obj_set_dims( n, m, obj ); \
|
||||
|
||||
@@ -144,7 +144,8 @@
|
||||
|
||||
#define bli_is_upper_or_lower( uplo ) \
|
||||
\
|
||||
( bli_is_upper( uplo ) || bli_is_lower( uplo ) )
|
||||
( bli_is_upper( uplo ) || \
|
||||
bli_is_lower( uplo ) )
|
||||
|
||||
#define bli_is_dense( uplo ) \
|
||||
\
|
||||
@@ -470,6 +471,106 @@
|
||||
( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) )
|
||||
|
||||
|
||||
// pruning-related
|
||||
|
||||
// Lower-stored case: prune the rows that lie above the point where the
// diagonal intersects the left edge of the matrix. diagoff and m are
// updated in place; offm_inc receives the number of rows pruned (zero if
// nothing was pruned). n is accepted but not referenced.
#define bli_prune_unstored_region_top_l( diagoff, m, n, offm_inc ) \
{ \
	if ( diagoff < 0 ) \
	{ \
		/* The diagonal meets the left edge -diagoff rows down; ignore
		   everything above that row. */ \
		m        = m + diagoff; \
		offm_inc = - diagoff; \
		diagoff  = 0; \
	} \
	else \
	{ \
		/* Nothing to prune. */ \
		offm_inc = 0; \
	} \
}
|
||||
|
||||
// Lower-stored case: prune the columns that lie to the right of the
// point where the diagonal intersects the bottom edge of the matrix.
// Only n is modified; offn_inc is always set to zero.
#define bli_prune_unstored_region_right_l( diagoff, m, n, offn_inc ) \
{ \
	/* If the diagonal meets the bottom edge before the last column,
	   clip n at that intersection. */ \
	if ( diagoff + m < n ) \
	{ \
		n = diagoff + m; \
	} \
\
	/* This macro never reports a column offset increment. */ \
	offn_inc = 0; \
}
|
||||
|
||||
// Upper-stored case: prune the columns that lie to the left of the point
// where the diagonal intersects the top edge of the matrix. diagoff and
// n are updated in place; offn_inc receives the number of columns pruned
// (zero if nothing was pruned). m is accepted but not referenced.
#define bli_prune_unstored_region_left_u( diagoff, m, n, offn_inc ) \
{ \
	if ( diagoff > 0 ) \
	{ \
		/* The diagonal meets the top edge diagoff columns in; ignore
		   everything to the left of that column. */ \
		n        = n - diagoff; \
		offn_inc = + diagoff; \
		diagoff  = 0; \
	} \
	else \
	{ \
		/* Nothing to prune. */ \
		offn_inc = 0; \
	} \
}
|
||||
|
||||
// Upper-stored case: prune the rows that lie below the point where the
// diagonal intersects the right edge of the matrix. Only m is modified;
// offm_inc is always set to zero.
#define bli_prune_unstored_region_bottom_u( diagoff, m, n, offm_inc ) \
{ \
	/* If the diagonal meets the right edge before the last row, clip m
	   at that intersection. */ \
	if ( -diagoff + n < m ) \
	{ \
		m = -diagoff + n; \
	} \
\
	/* This macro never reports a row offset increment. */ \
	offm_inc = 0; \
}
|
||||
|
||||
|
||||
// thread range-related
|
||||
|
||||
// Rotate a trapezoid by 180 degrees: recompute the diagonal offset and
// toggle uplo (via bli_toggle_uplo, defined elsewhere).
// NOTE: m and n are NOT parameters of this macro. They are picked up
// from whatever variables named m and n are in scope where the macro is
// expanded, so it may only be used where such variables hold the
// trapezoid's dimensions.
#define bli_rotate180_trapezoid( diagoff, uplo ) \
{ \
	diagoff = n - diagoff - m; \
	bli_toggle_uplo( uplo ); \
}

// Re-express the index range [start, end) relative to the opposite end
// of a dimension of size n: start becomes n - end and end becomes
// n - start. start and end must be modifiable lvalues; each argument is
// parenthesized so that compound expressions are handled correctly.
// n is evaluated twice. The temporaries are named start2 and end2, so
// arguments must not use those names.
#define bli_reverse_index_direction( start, end, n ) \
{ \
	dim_t start2 = ( n ) - ( start ); \
	dim_t end2   = ( n ) - ( end ); \
	( start ) = end2; \
	( end )   = start2; \
}

// Reflect a matrix description about its diagonal by delegating, in this
// order, to bli_swap_dims( m, n ), bli_negate_diag_offset( diagoff ), and
// bli_toggle_uplo( uplo ) (all defined elsewhere). All four arguments
// must be modifiable lvalues.
#define bli_reflect_about_diag( diagoff, uplo, m, n ) \
{ \
	bli_swap_dims( m, n ); \
	bli_negate_diag_offset( diagoff ); \
	bli_toggle_uplo( uplo ); \
}
|
||||
|
||||
|
||||
// mdim_t-related
|
||||
|
||||
// Evaluates to nonzero if mdim equals BLIS_M. The argument is
// parenthesized so that compound expressions (e.g. a conditional
// expression) compare as a whole.
#define bli_is_m_dim( mdim ) \
\
	( ( mdim ) == BLIS_M )

// Evaluates to nonzero if mdim equals BLIS_N.
#define bli_is_n_dim( mdim ) \
\
	( ( mdim ) == BLIS_N )

// Evaluates to the other dimension: BLIS_N if mdim equals BLIS_M, and
// BLIS_M otherwise.
#define bli_dim_toggled( mdim ) \
\
	( ( mdim ) == BLIS_M ? BLIS_N : BLIS_M )

// Statement form of bli_dim_toggled(): overwrites mdim, which must be a
// modifiable lvalue, with its toggled value.
#define bli_toggle_dim( mdim ) \
{ \
	mdim = bli_dim_toggled( mdim ); \
}
|
||||
|
||||
|
||||
|
||||
// index-related
|
||||
|
||||
#define bli_is_edge_f( i1, iter, left ) \
|
||||
|
||||
@@ -243,6 +243,22 @@
|
||||
bli_fmax( bli_fabs( a ), \
|
||||
bli_fabs( b ) )
|
||||
|
||||
// round
|
||||
|
||||
#define bli_round( val ) \
|
||||
\
|
||||
( round( val ) )
|
||||
|
||||
// round_to_mult
|
||||
|
||||
// Round val to the nearest multiple of mult using integer arithmetic:
// ( ( val + mult/2 ) / mult ) * mult, with val and mult/2 cast to
// guint_t. Exact ties round up (e.g. 6 rounded to a multiple of 4 gives
// 8). Both arguments are parenthesized so that compound expressions are
// handled correctly. mult is evaluated three times, so it must not have
// side effects, and it must be nonzero.
// NOTE(review): guint_t is defined elsewhere and is presumably unsigned,
// in which case negative inputs are not meaningful -- confirm.
#define bli_round_to_mult( val, mult ) \
\
	( guint_t )( ( ( ( guint_t )( val ) + \
	                 ( guint_t )( mult ) / 2 \
	               ) / ( mult ) \
	             ) * ( mult ) \
	           )
|
||||
|
||||
// isnan, isinf
|
||||
|
||||
#define bli_isinf( a ) isinf( a )
|
||||
|
||||
@@ -591,10 +591,8 @@ typedef struct obj_s
|
||||
// Basic fields
|
||||
struct obj_s* root;
|
||||
|
||||
dim_t offm;
|
||||
dim_t offn;
|
||||
dim_t m;
|
||||
dim_t n;
|
||||
dim_t off[2];
|
||||
dim_t dim[2];
|
||||
doff_t diag_off;
|
||||
|
||||
objbits_t info;
|
||||
@@ -626,10 +624,10 @@ typedef struct obj_s
|
||||
{ \
|
||||
(b).root = (a).root; \
|
||||
\
|
||||
(b).offm = (a).offm; \
|
||||
(b).offn = (a).offn; \
|
||||
(b).m = (a).m; \
|
||||
(b).n = (a).n; \
|
||||
(b).off[0] = (a).off[0]; \
|
||||
(b).off[1] = (a).off[1]; \
|
||||
(b).dim[0] = (a).dim[0]; \
|
||||
(b).dim[1] = (a).dim[1]; \
|
||||
(b).diag_off = (a).diag_off; \
|
||||
\
|
||||
(b).info = (a).info; \
|
||||
@@ -669,8 +667,8 @@ typedef struct obj_s
|
||||
{ \
|
||||
(b).root = (a).root; \
|
||||
\
|
||||
(b).offm = (a).offm; \
|
||||
(b).offn = (a).offn; \
|
||||
(b).off[0] = (a).off[0]; \
|
||||
(b).off[1] = (a).off[1]; \
|
||||
/* Avoid copying m since it will be overwritten. */ \
|
||||
/* Avoid copying n since it will be overwritten. */ \
|
||||
(b).diag_off = (a).diag_off; \
|
||||
@@ -727,6 +725,15 @@ typedef enum
|
||||
} subpart_t;
|
||||
|
||||
|
||||
// -- Matrix dimension type --
|
||||
|
||||
typedef enum
|
||||
{
|
||||
BLIS_M = 0,
|
||||
BLIS_N = 1
|
||||
} mdim_t;
|
||||
|
||||
|
||||
// -- Machine parameter types --
|
||||
|
||||
typedef enum
|
||||
|
||||
@@ -113,6 +113,7 @@ extern "C" {
|
||||
#include "bli_pool.h"
|
||||
#include "bli_mem.h"
|
||||
#include "bli_part.h"
|
||||
#include "bli_prune.h"
|
||||
#include "bli_query.h"
|
||||
#include "bli_blocksize.h"
|
||||
#include "bli_func.h"
|
||||
|
||||
203
test/thread_ranges/Makefile
Normal file
203
test/thread_ranges/Makefile
Normal file
@@ -0,0 +1,203 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name of The University of Texas at Austin nor the names
|
||||
# of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
#
|
||||
# Makefile
|
||||
#
|
||||
# Field G. Van Zee
|
||||
#
|
||||
# Makefile for standalone BLIS test drivers.
|
||||
#
|
||||
|
||||
#
|
||||
# --- Makefile PHONY target definitions ----------------------------------------
|
||||
#
|
||||
|
||||
.PHONY: all \
|
||||
test-ranges \
|
||||
clean cleanx
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Makefile initialization --------------------------------------------------
|
||||
#
|
||||
|
||||
# Define the name of the configuration file.
|
||||
CONFIG_MK_FILE := config.mk
|
||||
|
||||
# Define the name of the file containing build and architecture-specific
|
||||
# makefile definitions.
|
||||
MAKE_DEFS_FILE := make_defs.mk
|
||||
|
||||
# Locations of important files.
|
||||
ROOT_PATH := ../..
|
||||
CONFIG_DIR := config
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Include makefile configuration file --------------------------------------
|
||||
#
|
||||
|
||||
# Construct the path to the makefile configuration file that was generated by
|
||||
# the configure script.
|
||||
CONFIG_MK_PATH := $(ROOT_PATH)/$(CONFIG_MK_FILE)
|
||||
|
||||
# Include the configuration file.
|
||||
-include $(CONFIG_MK_PATH)
|
||||
|
||||
# Detect whether we actually got the configuration file. If we didn't, then
|
||||
# it is likely that the user has not yet generated it (via configure).
|
||||
ifeq ($(strip $(CONFIG_MK_INCLUDED)),yes)
|
||||
CONFIG_MK_PRESENT := yes
|
||||
else
|
||||
CONFIG_MK_PRESENT := no
|
||||
endif
|
||||
|
||||
# Now we have access to CONFIG_NAME, which tells us which sub-directory of the
|
||||
# config directory to use as our configuration.
|
||||
CONFIG_PATH := $(ROOT_PATH)/$(CONFIG_DIR)/$(CONFIG_NAME)
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Include makefile definitions file ----------------------------------------
|
||||
#
|
||||
|
||||
# Construct the path to the makefile definitions file residing inside of
|
||||
# the configuration sub-directory.
|
||||
MAKE_DEFS_MK_PATH := $(CONFIG_PATH)/$(MAKE_DEFS_FILE)
|
||||
|
||||
# Include the makefile definitions file.
|
||||
-include $(MAKE_DEFS_MK_PATH)
|
||||
|
||||
# Detect whether we actually got the make definitions file. If we didn't, then
|
||||
# it is likely that the configuration is invalid (or incomplete).
|
||||
ifeq ($(strip $(MAKE_DEFS_MK_INCLUDED)),yes)
|
||||
MAKE_DEFS_MK_PRESENT := yes
|
||||
else
|
||||
MAKE_DEFS_MK_PRESENT := no
|
||||
endif
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- BLAS and LAPACK implementations ------------------------------------------
|
||||
#
|
||||
|
||||
# BLIS library and header path. This is simply wherever it was installed.
|
||||
BLIS_LIB_PATH := $(INSTALL_PREFIX)/lib
|
||||
BLIS_INC_PATH := $(INSTALL_PREFIX)/include/blis
|
||||
|
||||
# BLIS library.
|
||||
BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- General build definitions ------------------------------------------------
|
||||
#
|
||||
|
||||
TEST_SRC_PATH := .
|
||||
TEST_OBJ_PATH := .
|
||||
|
||||
# Gather all local object files.
|
||||
TEST_OBJS := $(patsubst $(TEST_SRC_PATH)/%.c, \
|
||||
$(TEST_OBJ_PATH)/%.o, \
|
||||
$(wildcard $(TEST_SRC_PATH)/*.c))
|
||||
|
||||
# Override CFLAGS from make_defs.mk here, if desired.
|
||||
#CFLAGS := -g -O2 -march=native
|
||||
|
||||
# Add installed and local header paths to CFLAGS
|
||||
CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH) #-I$(ACML_INC_PATH)
|
||||
|
||||
LINKER := $(CC)
|
||||
LDFLAGS := #-L/home/00146/field/gnu/gcc-4.8.2/lib64
|
||||
LDFLAGS += -lgfortran -lm -lpthread -fopenmp
|
||||
|
||||
|
||||
# Datatype
|
||||
DT_S := -DDT=BLIS_FLOAT
|
||||
DT_D := -DDT=BLIS_DOUBLE
|
||||
DT_C := -DDT=BLIS_SCOMPLEX
|
||||
DT_Z := -DDT=BLIS_DCOMPLEX
|
||||
|
||||
# Problem size specification
|
||||
PDEF_MT := -DP_BEGIN=400 \
|
||||
-DP_END=8000 \
|
||||
-DP_INC=400
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Targets/rules ------------------------------------------------------------
|
||||
#
|
||||
|
||||
all: test-ranges
|
||||
|
||||
test-ranges: \
|
||||
test_ranges.x
|
||||
|
||||
|
||||
|
||||
# -- Object file rules --
|
||||
|
||||
$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
# Test driver objects (compiled with the problem size and datatype definitions)
|
||||
test_%.o: test_%.c
|
||||
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) -c $< -o $@
|
||||
|
||||
|
||||
# -- Executable file rules --
|
||||
|
||||
# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS
|
||||
# on the link command line in case BLIS was configured with the BLAS
|
||||
# compatibility layer. This prevents BLIS from inadvertently getting called
|
||||
# for the BLAS routines we are trying to test with.
|
||||
|
||||
test_ranges.x: test_ranges.o $(BLIS_LIB)
|
||||
$(LINKER) $< $(BLIS_LIB) $(LDFLAGS) -o $@
|
||||
|
||||
|
||||
# -- Clean rules --
|
||||
|
||||
clean: cleanx
|
||||
|
||||
cleanx:
|
||||
- $(RM_F) *.o *.x
|
||||
|
||||
313
test/thread_ranges/test_ranges.c
Normal file
313
test/thread_ranges/test_ranges.c
Normal file
@@ -0,0 +1,313 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
//#define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
bli_init();
|
||||
|
||||
#if 0
|
||||
obj_t a, b, c;
|
||||
obj_t aa, bb, cc;
|
||||
dim_t m, n, k;
|
||||
num_t dt;
|
||||
uplo_t uploa, uplob, uploc;
|
||||
|
||||
{
|
||||
dt = BLIS_DOUBLE;
|
||||
|
||||
m = 6;
|
||||
k = 6;
|
||||
n = 6;
|
||||
|
||||
bli_obj_create( dt, m, k, 0, 0, &a );
|
||||
bli_obj_create( dt, k, n, 0, 0, &b );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c );
|
||||
|
||||
uploa = BLIS_UPPER;
|
||||
uploa = BLIS_LOWER;
|
||||
bli_obj_set_struc( BLIS_TRIANGULAR, a );
|
||||
bli_obj_set_uplo( uploa, a );
|
||||
bli_obj_set_diag_offset( -2, a );
|
||||
|
||||
uplob = BLIS_UPPER;
|
||||
uplob = BLIS_LOWER;
|
||||
bli_obj_set_struc( BLIS_TRIANGULAR, b );
|
||||
bli_obj_set_uplo( uplob, b );
|
||||
bli_obj_set_diag_offset( -2, b );
|
||||
|
||||
uploc = BLIS_UPPER;
|
||||
//uploc = BLIS_LOWER;
|
||||
//uploc = BLIS_ZEROS;
|
||||
//uploc = BLIS_DENSE;
|
||||
bli_obj_set_struc( BLIS_HERMITIAN, c );
|
||||
//bli_obj_set_struc( BLIS_TRIANGULAR, c );
|
||||
bli_obj_set_uplo( uploc, c );
|
||||
bli_obj_set_diag_offset( 1, c );
|
||||
|
||||
bli_obj_alias_to( a, aa ); (void)aa;
|
||||
bli_obj_alias_to( b, bb ); (void)bb;
|
||||
bli_obj_alias_to( c, cc ); (void)cc;
|
||||
|
||||
bli_randm( &a );
|
||||
bli_randm( &b );
|
||||
bli_randm( &c );
|
||||
//bli_mkherm( &a );
|
||||
//bli_mktrim( &a );
|
||||
|
||||
bli_prune_unref_mparts( &cc, BLIS_M,
|
||||
&aa, BLIS_N );
|
||||
|
||||
bli_printm( "c orig", &c, "%4.1f", "" );
|
||||
bli_printm( "c alias", &cc, "%4.1f", "" );
|
||||
bli_printm( "a orig", &a, "%4.1f", "" );
|
||||
bli_printm( "a alias", &aa, "%4.1f", "" );
|
||||
//bli_obj_print( "a struct", &a );
|
||||
}
|
||||
#endif
|
||||
|
||||
dim_t p_begin, p_max, p_inc;
|
||||
gint_t m_input, n_input;
|
||||
char uploa_ch;
|
||||
doff_t diagoffa;
|
||||
dim_t bf;
|
||||
dim_t n_way;
|
||||
char part_dim_ch;
|
||||
bool_t go_fwd;
|
||||
char out_ch;
|
||||
|
||||
obj_t a;
|
||||
|
||||
thrinfo_t thrinfo;
|
||||
dim_t m, n;
|
||||
uplo_t uploa;
|
||||
bool_t part_m_dim, part_n_dim;
|
||||
bool_t go_bwd;
|
||||
dim_t p;
|
||||
num_t dt;
|
||||
dim_t start, end;
|
||||
|
||||
dim_t width;
|
||||
siz_t area;
|
||||
|
||||
gint_t t_begin, t_stop, t_inc;
|
||||
dim_t t;
|
||||
|
||||
if ( argc == 13 )
|
||||
{
|
||||
sscanf( argv[1], "%lu", &p_begin );
|
||||
sscanf( argv[2], "%lu", &p_max );
|
||||
sscanf( argv[3], "%lu", &p_inc );
|
||||
sscanf( argv[4], "%ld", &m_input );
|
||||
sscanf( argv[5], "%ld", &n_input );
|
||||
sscanf( argv[6], "%c", &uploa_ch );
|
||||
sscanf( argv[7], "%ld", &diagoffa );
|
||||
sscanf( argv[8], "%lu", &bf );
|
||||
sscanf( argv[9], "%lu", &n_way );
|
||||
sscanf( argv[10], "%c", &part_dim_ch );
|
||||
sscanf( argv[11], "%lu", &go_fwd );
|
||||
sscanf( argv[12], "%c", &out_ch );
|
||||
}
|
||||
else
|
||||
{
|
||||
printf( "\n" );
|
||||
printf( " %s\n", argv[0] );
|
||||
printf( "\n" );
|
||||
printf( " Simulate the dimension ranges assigned to threads when\n" );
|
||||
printf( " partitioning a matrix for parallelism in BLIS.\n" );
|
||||
printf( "\n" );
|
||||
printf( " Usage:\n" );
|
||||
printf( "\n" );
|
||||
printf( " %s p_beg p_max p_inc m n uplo doff bf n_way part_dim go_fwd out\n", argv[0] );
|
||||
printf( "\n" );
|
||||
printf( " p_beg: the first problem size p to test.\n" );
|
||||
printf( " p_max: the maximum problem size p to test.\n" );
|
||||
printf( " p_inc: the increase in problem size p between tests.\n" );
|
||||
printf( " m: the m dimension:\n" );
|
||||
printf( " n: the n dimension:\n" );
|
||||
printf( " if m,n = -1: bind m,n to problem size p.\n" );
|
||||
printf( " if m,n = 0: bind m,n to p_max.\n" );
|
||||
printf( " if m,n > 0: hold m,n = c constant for all p.\n" );
|
||||
printf( " uplo: the uplo field of the matrix being partitioned:\n" );
|
||||
printf( " 'l': lower-stored (BLIS_LOWER)\n" );
|
||||
printf( " 'u': upper-stored (BLIS_UPPER)\n" );
|
||||
printf( " 'd': densely-stored (BLIS_DENSE)\n" );
|
||||
printf( " doff: the diagonal offset of the matrix being partitioned.\n" );
|
||||
printf( " bf: the simulated blocking factor. all thread ranges must\n" );
|
||||
printf( " be a multiple of bf, except for the range that contains\n" );
|
||||
printf( " the edge case (if one exists). the blocking factor\n" );
|
||||
printf( " would typically correspond to a register blocksize.\n" );
|
||||
printf( " n_way: the number of ways of parallelism for which we are\n" );
|
||||
printf( " partitioning (i.e.: the number of threads, or thread\n" );
|
||||
printf( " groups).\n" );
|
||||
printf( " part_dim: the dimension to partition:\n" );
|
||||
printf( " 'm': partition the m dimension.\n" );
|
||||
printf( " 'n': partition the n dimension.\n" );
|
||||
printf( " go_fwd: the direction to partition:\n" );
|
||||
printf( " '1': forward, e.g. left-to-right (part_dim = 'm') or\n" );
|
||||
printf( " top-to-bottom (part_dim = 'n')\n" );
|
||||
printf( " '0': backward, e.g. right-to-left (part_dim = 'm') or\n" );
|
||||
printf( " bottom-to-top (part_dim = 'n')\n" );
|
||||
printf( " NOTE: reversing the direction does not change the\n" );
|
||||
printf( " subpartitions' widths, but it does change which end of\n" );
|
||||
printf( " the index range receives the edge case, if it exists.\n" );
|
||||
printf( " out: the type of output per thread-column:\n" );
|
||||
printf( " 'w': the width (and area) of the thread's subpartition\n" );
|
||||
printf( " 'r': the actual ranges of the thread's subpartition\n" );
|
||||
printf( " where the start and end points of each range are\n" );
|
||||
printf( " inclusive and exclusive, respectively.\n" );
|
||||
printf( "\n" );
|
||||
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if ( m_input == 0 ) m_input = p_max;
|
||||
if ( n_input == 0 ) n_input = p_max;
|
||||
|
||||
if ( part_dim_ch == 'm' ) { part_m_dim = TRUE; part_n_dim = FALSE; }
|
||||
else { part_m_dim = FALSE; part_n_dim = TRUE; }
|
||||
|
||||
go_bwd = !go_fwd;
|
||||
|
||||
if ( uploa_ch == 'l' ) uploa = BLIS_LOWER;
|
||||
else if ( uploa_ch == 'u' ) uploa = BLIS_UPPER;
|
||||
else uploa = BLIS_DENSE;
|
||||
|
||||
if ( part_n_dim )
|
||||
{
|
||||
if ( bli_is_upper( uploa ) ) { t_begin = n_way-1; t_stop = -1; t_inc = -1; }
|
||||
else /* if lower or dense */ { t_begin = 0; t_stop = n_way; t_inc = 1; }
|
||||
}
|
||||
else // if ( part_m_dim )
|
||||
{
|
||||
if ( bli_is_lower( uploa ) ) { t_begin = n_way-1; t_stop = -1; t_inc = -1; }
|
||||
else /* if upper or dense */ { t_begin = 0; t_stop = n_way; t_inc = 1; }
|
||||
}
|
||||
|
||||
printf( "\n" );
|
||||
printf( " part: %3s doff: %3ld bf: %3ld output: %s\n",
|
||||
( part_n_dim ? ( go_fwd ? "l2r" : "r2l" )
|
||||
: ( go_fwd ? "t2b" : "b2t" ) ),
|
||||
diagoffa, bf,
|
||||
( out_ch == 'w' ? "width(area)" : "ranges" ) );
|
||||
printf( " uplo: %3c nt: %3ld\n", uploa_ch, n_way );
|
||||
printf( "\n" );
|
||||
|
||||
printf( " " );
|
||||
for ( t = t_begin; t != t_stop; t += t_inc )
|
||||
{
|
||||
if ( part_n_dim )
|
||||
{
|
||||
if ( t == t_begin ) printf( "left... " );
|
||||
else if ( t == t_stop-t_inc ) printf( " ...right" );
|
||||
else printf( " " );
|
||||
}
|
||||
else // if ( part_m_dim )
|
||||
{
|
||||
if ( t == t_begin ) printf( "top... " );
|
||||
else if ( t == t_stop-t_inc ) printf( " ...bottom" );
|
||||
else printf( " " );
|
||||
}
|
||||
}
|
||||
printf( "\n" );
|
||||
|
||||
|
||||
printf( "%4c x %4c ", 'm', 'n' );
|
||||
for ( t = t_begin; t != t_stop; t += t_inc )
|
||||
{
|
||||
printf( "%9s %lu ", "thread", t );
|
||||
}
|
||||
printf( "\n" );
|
||||
printf( "-------------" );
|
||||
for ( t = t_begin; t != t_stop; t += t_inc )
|
||||
{
|
||||
printf( "-------------" );
|
||||
}
|
||||
printf( "\n" );
|
||||
|
||||
|
||||
for ( p = p_begin; p <= p_max; p += p_inc )
|
||||
{
|
||||
if ( m_input < 0 ) m = ( dim_t )p;
|
||||
else m = ( dim_t )m_input;
|
||||
if ( n_input < 0 ) n = ( dim_t )p;
|
||||
else n = ( dim_t )n_input;
|
||||
|
||||
dt = BLIS_DOUBLE;
|
||||
|
||||
bli_obj_create( dt, m, n, 0, 0, &a );
|
||||
|
||||
bli_obj_set_struc( BLIS_TRIANGULAR, a );
|
||||
bli_obj_set_uplo( uploa, a );
|
||||
bli_obj_set_diag_offset( diagoffa, a );
|
||||
|
||||
bli_randm( &a );
|
||||
|
||||
printf( "%4lu x %4lu ", m, n );
|
||||
|
||||
for ( t = t_begin; t != t_stop; t += t_inc )
|
||||
{
|
||||
thrinfo.n_way = n_way;
|
||||
thrinfo.work_id = t;
|
||||
|
||||
if ( part_n_dim && go_fwd )
|
||||
area = bli_get_range_weighted_l2r( &thrinfo, &a, bf, &start, &end );
|
||||
else if ( part_n_dim && go_bwd )
|
||||
area = bli_get_range_weighted_r2l( &thrinfo, &a, bf, &start, &end );
|
||||
else if ( part_m_dim && go_fwd )
|
||||
area = bli_get_range_weighted_t2b( &thrinfo, &a, bf, &start, &end );
|
||||
else // ( part_m_dim && go_bwd )
|
||||
area = bli_get_range_weighted_b2t( &thrinfo, &a, bf, &start, &end );
|
||||
|
||||
width = end - start;
|
||||
|
||||
if ( out_ch == 'w' ) printf( "%4lu(%6lu) ", width, area );
|
||||
else printf( "[%4lu,%4lu) ", start, end );
|
||||
}
|
||||
|
||||
printf( "\n" );
|
||||
|
||||
bli_obj_free( &a );
|
||||
}
|
||||
|
||||
bli_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user