Made framework changes to initialize specific cache block sizes for TRSM.

Details:
-This commit optimizes DTRSM performance (single-threaded and
 multi-threaded) on zen2.
-The optimization uses different MC, KC & NC values for TRSM than
 those used by other Level-3 routines such as DGEMM.
-Changed TRSM framework code to choose these blocksizes for TRSM
 on zen family configurations.
-Added a new field called "trsm_blkszs" to cntx structure in order to
 store TRSM specific block sizes.
-Implemented routines to initialize, set and query the TRSM-specific
 block sizes.
-Defined a new macro "AOCL_BLIS_ZEN" in the configure script.
 This macro is defined automatically for zen family architectures.
 It lets TRSM use its own cache block sizes instead of the common level-3 block sizes.

Change-Id: Id8557b1c962a316b1edecca9cd582675eaf35fe6
Signed-off-by: Meghana Vankadari <meghana.vankadari@amd.com>
AMD-Internal: [CPUPL-656]
This commit is contained in:
Meghana Vankadari
2020-02-12 12:32:36 +05:30
parent f965b95d8b
commit cc98047fd6
13 changed files with 315 additions and 14 deletions

View File

@@ -45,6 +45,12 @@
// Enabled kernel sets (kernel_list)
@kernel_list_defines@
// This macro is defined only for zen-family configurations.
// It lets TRSM use its own cache-blocking sizes instead of the common level-3 cache-block sizes.
#if @enable_aocl_zen@
#define AOCL_BLIS_ZEN
#endif
#if @enable_openmp@
#define BLIS_ENABLE_OPENMP
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -168,6 +168,20 @@ void bli_cntx_init_zen( cntx_t* cntx )
cntx
);
// Update the context with the current architecture's register and cache
// blocksizes for level-3 TRSM execution.
bli_cntx_set_trsm_blkszs
(
5,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -137,8 +137,28 @@ void bli_cntx_init_zen2( cntx_t* cntx )
);
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
//Initialize TRSM blocksize objects with architecture-specific values.
//Using different cache block sizes for TRSM instead of common level-3 block sizes.
//Tuning is done for double-precision only.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 );
// Update the context with the current architecture's register and cache
// blocksizes for level-3 TRSM problems.
bli_cntx_set_trsm_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
// Initialize sup thresholds with architecture-appropriate values. s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 100, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 120, -1, -1 );

15
configure vendored
View File

@@ -5,7 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
# Copyright (C) 2020, Advanced Micro Devices, Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -3016,6 +3016,18 @@ main()
uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]')
config_name_define="#define BLIS_FAMILY_${uconf}\n"
# Create an AOCL-specific #define.
# The macro is enabled only when the configuration name contains 'zen'.
# It lets TRSM use its own cache block sizes instead of the common level-3
# block sizes.
# grep -c prints the number of matching lines; config_name is a single line,
# so the result is either 0 or 1.
uconf=$(echo "${config_name}" | grep -c 'zen')
if [[ "${uconf}" == 1 ]]; then
	enable_aocl_zen='yes'
	enable_aocl_zen_01=1
else
	# Shell assignments must not have spaces around '='; with spaces the
	# shell would try to execute a command named 'enable_aocl_zen'.
	enable_aocl_zen='no'
	enable_aocl_zen_01=0
fi
# Create a list of #defines, one for each configuration in config_list.
config_list_defines=""
for conf in ${config_list}; do
@@ -3126,6 +3138,7 @@ main()
| perl -pe "s/\@config_name_define\@/${config_name_define}/g" \
| perl -pe "s/\@config_list_defines\@/${config_list_defines}/g" \
| perl -pe "s/\@kernel_list_defines\@/${kernel_list_defines}/g" \
| sed -e "s/\@enable_aocl_zen\@/${enable_aocl_zen_01}/g" \
| sed -e "s/@enable_openmp@/${enable_openmp_01}/g" \
| sed -e "s/@enable_pthreads@/${enable_pthreads_01}/g" \
| sed -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \

View File

@@ -1,10 +1,11 @@
/*
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,7 +35,6 @@
#include "blis.h"
dim_t bli_l3_determine_kc
(
dir_t direct,
@@ -311,7 +311,7 @@ dim_t PASTEMAC0(opname) \
/* Extract the execution datatype and use it to query the corresponding
blocksize and blocksize maximum values from the blksz_t object. */ \
dt = bli_obj_exec_dt( a ); \
bsize = bli_cntx_get_blksz( bszid, cntx ); \
bsize = TRSM_BLKSZ_FUNC( bszid, cntx ); \
b_alg = bli_blksz_get_def( dt, bsize ); \
b_max = bli_blksz_get_max( dt, bsize ); \
\

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -80,9 +80,15 @@ void bli_trsm_blk_var1
{
obj_t a11_1, c1_1;
// For zen architectures, TRSM uses different MC, KC and NC blocking sizes than the other Level-3 routines,
// so a separate function is called to query the TRSM-specific block sizes for the zen family.
#ifdef AOCL_BLIS_ZEN
b_alg = bli_determine_blocksize_trsm( direct, i, my_end, &a11,
bli_cntl_bszid( cntl ), cntx );
#else
b_alg = bli_determine_blocksize( direct, i, my_end, &a11,
bli_cntl_bszid( cntl ), cntx );
#endif
// Acquire partitions for A1 and C1.
bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
i, b_alg, &a11, &a11_1 );

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -67,8 +67,15 @@ void bli_trsm_blk_var2
for ( dim_t i = my_start; i < my_end; i += b_alg )
{
// Determine the current algorithmic blocksize.
// For the zen family, TRSM uses different MC, KC and NC blocksizes than the other Level-3 routines,
// so a separate function is called to query the TRSM-specific block sizes for the zen family.
#ifdef AOCL_BLIS_ZEN
b_alg = bli_determine_blocksize_trsm( direct, i, my_end, b,
bli_cntl_bszid( cntl ), cntx );
#else
b_alg = bli_determine_blocksize( direct, i, my_end, b,
bli_cntl_bszid( cntl ), cntx );
#endif
// Acquire partitions for B1 and C1.
bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -297,6 +298,78 @@ dim_t bli_determine_blocksize_b
return b_use;
}
#ifdef AOCL_BLIS_ZEN
dim_t bli_determine_blocksize_trsm
     (
       dir_t   direct,
       dim_t   i,
       dim_t   dim,
       obj_t*  obj,
       bszid_t bszid,
       cntx_t* cntx
     )
{
	// Dispatch on the traversal direction: BLIS_FWD selects the forward
	// TRSM blocksize routine, any other value selects the backward one.
	return ( direct == BLIS_FWD )
	       ? bli_determine_blocksize_trsm_f( i, dim, obj, bszid, cntx )
	       : bli_determine_blocksize_trsm_b( i, dim, obj, bszid, cntx );
}
dim_t bli_determine_blocksize_trsm_f
     (
       dim_t   i,
       dim_t   dim,
       obj_t*  obj,
       bszid_t bszid,
       cntx_t* cntx
     )
{
	// The execution datatype of obj selects which entries of the blksz_t
	// object are read.
	num_t    exec_dt  = bli_obj_exec_dt( obj );

	// Fetch the TRSM-specific blksz_t for bszid (not the common level-3
	// one) and read its default and maximum values for exec_dt.
	blksz_t* trsm_bsz = bli_cntx_get_trsm_blksz( bszid, cntx );
	dim_t    blk_def  = bli_blksz_get_def( exec_dt, trsm_bsz );
	dim_t    blk_max  = bli_blksz_get_max( exec_dt, trsm_bsz );

	// Let the forward helper pick the blocksize to use at offset i.
	return bli_determine_blocksize_f_sub( i, dim, blk_def, blk_max );
}
dim_t bli_determine_blocksize_trsm_b
     (
       dim_t   i,
       dim_t   dim,
       obj_t*  obj,
       bszid_t bszid,
       cntx_t* cntx
     )
{
	// The execution datatype of obj selects which entries of the blksz_t
	// object are read.
	num_t    exec_dt  = bli_obj_exec_dt( obj );

	// Fetch the TRSM-specific blksz_t for bszid (not the common level-3
	// one) and read its default and maximum values for exec_dt.
	blksz_t* trsm_bsz = bli_cntx_get_trsm_blksz( bszid, cntx );
	dim_t    blk_def  = bli_blksz_get_def( exec_dt, trsm_bsz );
	dim_t    blk_max  = bli_blksz_get_max( exec_dt, trsm_bsz );

	// Let the backward helper pick the blocksize to use at offset i.
	return bli_determine_blocksize_b_sub( i, dim, blk_def, blk_max );
}
#endif
dim_t bli_determine_blocksize_f_sub
(
dim_t i,

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -278,6 +279,38 @@ dim_t bli_determine_blocksize_b
cntx_t* cntx
);
#ifdef AOCL_BLIS_ZEN
// Determines the blocksize for a TRSM partitioning step using the
// TRSM-specific blocksizes stored in the context. The direct argument
// selects between the _trsm_f (BLIS_FWD) and _trsm_b variants.
dim_t bli_determine_blocksize_trsm
(
dir_t direct,
dim_t i,
dim_t dim,
obj_t* obj,
bszid_t bszid,
cntx_t* cntx
);
// Forward variant: reads the TRSM-specific blksz_t identified by bszid for
// the execution datatype of obj and passes its default/maximum values to
// bli_determine_blocksize_f_sub().
dim_t bli_determine_blocksize_trsm_f
(
dim_t i,
dim_t dim,
obj_t* obj,
bszid_t bszid,
cntx_t* cntx
);
// Backward variant: reads the TRSM-specific blksz_t identified by bszid for
// the execution datatype of obj and passes its default/maximum values to
// bli_determine_blocksize_b_sub().
dim_t bli_determine_blocksize_trsm_b
(
dim_t i,
dim_t dim,
obj_t* obj,
bszid_t bszid,
cntx_t* cntx
);
#endif
dim_t bli_determine_blocksize_f_sub
(
dim_t i,

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -1261,6 +1261,111 @@ void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... )
// -----------------------------------------------------------------------------
#ifdef AOCL_BLIS_ZEN
void bli_cntx_set_trsm_blkszs( dim_t n_bs, ... )
{
	// Stores TRSM-specific blocksizes in the context's trsm_blkszs array.
	//
	// This function should be called from the bli_cntx_init_*() function for
	// zen family architectures to set TRSM blocksizes. It should be called
	// after bli_cntx_init_defaults() so that the context begins with default
	// blocksizes across all datatypes.
	//
	// Arguments: n_bs, followed by n_bs ( bszid_t, blksz_t* ) pairs, followed
	// by the cntx_t* to update.
	//
	// NOTE(review): this definition is compiled only when AOCL_BLIS_ZEN is
	// defined, while the prototype in the header is exported unconditionally.
	// A build that calls this function without the macro defined would
	// presumably fail to link -- confirm against the supported configurations.

	/* Example prototypes:

	   void bli_cntx_set_trsm_blkszs
	   (
	     dim_t   n_bs,
	     bszid_t bs0_id, blksz_t* blksz0,
	     bszid_t bs1_id, blksz_t* blksz1,
	     bszid_t bs2_id, blksz_t* blksz2,
	     ...
	     cntx_t* cntx
	   );
	*/

	va_list args;
	dim_t   i;

	// Allocate some temporary local arrays. They are needed because the
	// context pointer is the LAST variadic argument, so the tuples must be
	// buffered until it has been read.

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_trsm_blkszs(): " );
	#endif
	bszid_t*  bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_trsm_blkszs(): " );
	#endif
	blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ) );

	// -- Begin variable argument section --

	// Initialize variable argument environment.
	va_start( args, n_bs );

	// Process n_bs tuples.
	for ( i = 0; i < n_bs; ++i )
	{
		// Here, we query the variable argument list for:
		// - the bszid_t of the blocksize we're about to process,
		// - the address of the blksz_t object.
		bszid_t  bs_id = ( bszid_t  )va_arg( args, bszid_t  );
		blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* );

		// Store the values in our temporary arrays.
		bszids[ i ] = bs_id;
		blkszs[ i ] = blksz;
	}

	// The last argument should be the context pointer.
	cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );

	// Shutdown variable argument environment and clean up stack.
	va_end( args );

	// -- End variable argument section --

	// Query the context for the address of the TRSM blocksize object array.
	blksz_t* cntx_l3_trsm_blkszs = bli_cntx_trsm_blkszs_buf( cntx );

	// Now that we have the context address, we want to copy the values
	// from the temporary buffers into the corresponding buffers in the
	// context. Notice that the blksz_t* pointers were saved, rather than
	// the objects themselves, but we copy the contents of the objects
	// when copying into the context.

	// Process each blocksize id tuple provided.
	for ( i = 0; i < n_bs; ++i )
	{
		// Read the current blocksize id and blksz_t* pointer.
		bszid_t  bs_id = bszids[ i ];
		blksz_t* blksz = blkszs[ i ];

		blksz_t* cntx_l3_trsm_blksz = &cntx_l3_trsm_blkszs[ bs_id ];

		// Copy the blksz_t object contents into the appropriate location
		// within the context's TRSM blksz_t array. Presumably only
		// positive entries are copied, per the helper's name -- confirm.
		bli_blksz_copy_if_pos( blksz, cntx_l3_trsm_blksz );
	}

	// Free the temporary local arrays.

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_trsm_blkszs(): " );
	#endif
	bli_free_intl( blkszs );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_trsm_blkszs(): " );
	#endif
	bli_free_intl( bszids );
}
#endif
// -----------------------------------------------------------------------------
void bli_cntx_set_l1f_kers( dim_t n_kers, ... )
{
// This function can be called from the bli_cntx_init_*() function for

View File

@@ -6,7 +6,7 @@
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -84,6 +84,10 @@ static bszid_t* bli_cntx_bmults_buf( cntx_t* cntx )
{
return cntx->bmults;
}
static blksz_t* bli_cntx_trsm_blkszs_buf( cntx_t* cntx )
{
	// Return the address of the first element of the context's
	// TRSM-specific blocksize array.
	return &cntx->trsm_blkszs[ 0 ];
}
static func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx )
{
return cntx->l3_vir_ukrs;
@@ -333,6 +337,16 @@ static blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx )
return blksz;
}
static blksz_t* bli_cntx_get_trsm_blksz( bszid_t bs_id, cntx_t* cntx )
{
	// Index the context's TRSM blocksize array by bs_id and return the
	// address of that blksz_t entry.
	return bli_cntx_trsm_blkszs_buf( cntx ) + bs_id;
}
static dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
{
blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx );
@@ -735,6 +749,8 @@ BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx );
BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_trsm_blkszs( dim_t n_bs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... );

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -217,6 +218,11 @@
#endif
// TRSM_BLKSZ_FUNC names the context query used to fetch blocksizes for TRSM:
// the TRSM-specific getter when AOCL_BLIS_ZEN is defined, and the common
// blocksize getter otherwise.
#ifdef AOCL_BLIS_ZEN
#define TRSM_BLKSZ_FUNC bli_cntx_get_trsm_blksz
#else
#define TRSM_BLKSZ_FUNC bli_cntx_get_blksz
#endif
#endif

View File

@@ -6,7 +6,7 @@
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018-2019, Advanced Micro Devices, Inc.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -1420,6 +1420,8 @@ typedef struct cntx_s
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
bszid_t bmults[ BLIS_NUM_BLKSZS ];
blksz_t trsm_blkszs[ BLIS_NUM_BLKSZS ];
func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ];
func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ];
mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ];