diff --git a/CREDITS b/CREDITS index 17e9e14f2..f1fa0b71f 100644 --- a/CREDITS +++ b/CREDITS @@ -35,13 +35,14 @@ but many others have contributed code and feedback, including Tony Kelman @tkelman Lee Killough (Cray) Mike Kistler (IBM, Austin Research Laboratory) + Michael Lehn @michael-lehn + Dave Love @loveshack + Tze Meng Low (The University of Texas at Austin) + Ye Luo @ye-luo (Argonne National Laboratory) Ricardo Magana @magania (Hewlett Packard Enterprise) Bryan Marker @bamarker (The University of Texas at Austin) Devin Matthews @devinamatthews (The University of Texas at Austin) Stefanos Mavros @smavros - Michael Lehn @michael-lehn - Dave Love @loveshack - Tze Meng Low (The University of Texas at Austin) Nisanth Padinharepatt (AMD) Devangi Parikh @dnparikh (The University of Texas at Austin) Elmar Peise @elmar-peise (RWTH-Aachen) diff --git a/build/add-copyright.py b/build/add-copyright.py index 0d5e52d5e..9a18b95fc 100755 --- a/build/add-copyright.py +++ b/build/add-copyright.py @@ -187,6 +187,8 @@ def main(): else: filename = git_words[1] + #my_echo( "-debug---- %s" % filename ) + # Start by opening the file. (We can assume it exists since it # was found by 'git status', so no need to check for existence.) # Read all lines in the file and then close it. @@ -203,7 +205,7 @@ def main(): # If the file does not have any copyright notice in it already, we # assume we don't need to update it. if not has_cr: - my_echo( "[skipped] %s" % filename ) + my_echo( "[nocrline] %s" % filename ) continue # Check whether the file already has a copyright for the_org. We may @@ -214,7 +216,7 @@ def main(): mod_file_lines = [] # At this point we know that the file has at least one copyright, and - # has_org_cr encodes whether already has a copyright for the_org. + # has_org_cr encodes whether it already has a copyright for the_org. # We process the files that we know already have copyrights for the_org # differently from the files that do not yet have them. 
@@ -240,12 +242,15 @@ def main(): repl_line = ' %s, ' % cur_year line_ny = re.sub( find_line, repl_line, line ) - my_echo( "[updated] %s" % filename ) + my_echo( "[updated ] %s" % filename ) # Add the updated line to the running list. mod_file_lines += line_ny else: + + my_echo( "[up2date ] %s" % filename ) + # Add the unchanged line to the running list. mod_file_lines += line @@ -262,7 +267,7 @@ def main(): # Don't go any further if we're only updating existing copyright # lines. if update_only: - my_echo( "[skipped] %s" % filename ) + my_echo( "[nocrline] %s" % filename ) continue num_file_lines = len( file_lines ) @@ -313,7 +318,7 @@ def main(): mod_file_lines += line mod_file_lines += line_nyno - my_echo( "[added ] %s" % filename ) + my_echo( "[added ] %s" % filename ) # endif resnext diff --git a/build/bli_config.h.in b/build/bli_config.h.in index b7e5adf85..2fa1fb127 100644 --- a/build/bli_config.h.in +++ b/build/bli_config.h.in @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -52,6 +53,14 @@ #define BLIS_ENABLE_PTHREADS #endif +#if @enable_jrir_slab@ +#define BLIS_ENABLE_JRIR_SLAB +#endif + +#if @enable_jrir_rr@ +#define BLIS_ENABLE_JRIR_RR +#endif + #if @enable_packbuf_pools@ #define BLIS_ENABLE_PACKBUF_POOLS #endif diff --git a/build/irun.py b/build/irun.py new file mode 100755 index 000000000..97cc39c2f --- /dev/null +++ b/build/irun.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2018, The University of Texas at Austin +# Copyright (C) 2018, Advanced Micro Devices, Inc. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas at Austin nor the names +# of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# Import modules +import os +import sys +import getopt +import re +import subprocess +import time +import statistics + + +def print_usage(): + + my_print( " " ) + my_print( " %s" % script_name ) + my_print( " " ) + my_print( " Field G. Van Zee" ) + my_print( " " ) + my_print( " Repeatedly run a test driver and accumulate statistics for the" ) + my_print( " output." 
) + my_print( " " ) + my_print( " Usage:" ) + my_print( " " ) + my_print( " %s [options] drivername" % script_name ) + my_print( " " ) + my_print( " Arguments:" ) + my_print( " " ) + my_print( " drivername The filename/path of the test driver to run. The" ) + my_print( " test driver must output its performance data to" ) + my_print( " standard output." ) + my_print( " " ) + my_print( " The following options are accepted:" ) + my_print( " " ) + my_print( " -c num performance column index" ) + my_print( " Find the performance result in column index <num> of" ) + my_print( " the test driver's output. Here, a column is defined" ) + my_print( " as a contiguous sequence of non-whitespace characters," ) + my_print( " with the column indices beginning at 0. By default," ) + my_print( " the second-to-last column index in the output is used." ) + my_print( " " ) + my_print( " -d delay sleep() delay" ) + my_print( " Wait <delay> seconds after each execution of the" ) + my_print( " test driver. The default delay is 0." ) + my_print( " " ) + my_print( " -n niter number of iterations" ) + my_print( " Execute the test driver <niter> times. The default" ) + my_print( " value is 10." ) + my_print( " " ) + my_print( " -q quiet; summary only" ) + my_print( " Do not output statistics after every new execution of" ) + my_print( " the test driver; instead, only output the final values" ) + my_print( " after all iterations are complete. The default is to" ) + my_print( " output updated statistics after each iteration." ) + my_print( " " ) + my_print( " -h help" ) + my_print( " Output this information and exit." ) + my_print( " " ) + + +# ------------------------------------------------------------------------------ + +def my_print( s ): + + sys.stdout.write( "%s\n" % s ) + #sys.stdout.flush() + +# ------------------------------------------------------------------------------ + +# Global variables.
+script_name = None +output_name = None + +def main(): + + global script_name + global output_name + + # Obtain the script name. + path, script_name = os.path.split(sys.argv[0]) + + output_name = script_name + + # Default values for optional arguments. + #perf_col = 9 + perf_col = -1 + delay = 0 + niter = 10 + quiet = False + + # Process our command line options. + try: + opts, args = getopt.getopt( sys.argv[1:], "c:d:n:hq" ) + + except getopt.GetoptError as err: + # print help information and exit: + my_print( str(err) ) # will print something like "option -a not recognized" + print_usage() + sys.exit(2) + + for opt, optarg in opts: + if opt == "-c": + perf_col = optarg + elif opt == "-d": + delay = optarg + elif opt == "-n": + niter = optarg + elif opt == "-q": + quiet = True + elif opt == "-h": + print_usage() + sys.exit() + else: + print_usage() + sys.exit() + + # Print usage if we don't have exactly one argument. + if len( args ) != 1: + print_usage() + sys.exit() + + # Acquire our only mandatory argument: the name of the test driver. + driverfile = args[0] + + #my_print( "test driver: %s" % driverfile ) + #my_print( "column num: %s" % perf_col ) + #my_print( "delay: %s" % delay ) + #my_print( "num iter: %s" % niter ) + + # Build a list of iterations. + iters = range( int(niter) ) + + # Run the test driver once to detect the number of lines of output. + p = subprocess.run( driverfile, stdout=subprocess.PIPE ) + lines0 = p.stdout.decode().splitlines() + num_lines0 = int(len(lines0)) + + # Initialize the list of lists (one list per performance result). + aperf = [] + for i in range( num_lines0 ): + aperf.append( [] ) + + for it in iters: + + # Run the test driver. + p = subprocess.run( driverfile, stdout=subprocess.PIPE ) + + # Acquire the lines of output. + lines = p.stdout.decode().splitlines() + + # Accumulate the test driver's latest results into aperf. + for i in range( num_lines0 ): + + # Parse the current line to find the performance value. 
+ line = lines[i] + words = line.split() + if perf_col == -1: + perf = words[ len(words)-2 ] + else: + perf = words[ int(perf_col) ] + + # As unlikely as it is, guard against Inf and NaN. + if float(perf) == float('Inf') or \ + float(perf) == -float('Inf') or \ + float(perf) == float('NaN'): perf = 0.0 + + # Add the performance value to the list at the ith entry of aperf. + aperf[i].append( float(perf) ) + + # Compute stats for the current line. + avgp = statistics.mean( aperf[i] ) + maxp = max( aperf[i] ) + minp = min( aperf[i] ) + + # Only compute stdev() when we have two or more data points. + if len( aperf[i] ) > 1: stdp = statistics.stdev( aperf[i] ) + else: stdp = 0.0 + + # Construct a string to match the performance value and then + # use that string to search-and-replace with four format specs + # for the min, avg, max, and stdev values computed above. + search = '%8s' % perf + newline = re.sub( str(search), ' %7.2f %7.2f %7.2f %6.2f', line ) + + # Search for the column index range that would be present if this were + # matlab-compatible output. The index range will typically be 1:n, + # where n is the number of columns of data. + found_index = False + for word in words: + if re.match( '1:', word ): + index_str = word + found_index = True + break + + # If we find the column index range, we need to update it to reflect + # the replacement of one column of data with four, for a net increase + # of columns. We do so via another instance of re.sub() in which we + # search for the old index string and replace it with the new one. + if found_index: + last_col = int(index_str[2]) + 3 + new_index_str = '1:%1s' % last_col + newline = re.sub( index_str, new_index_str, newline ) + + # If the quiet flag was not given, output the intermediate results. + if not quiet: + print( newline % ( float(minp), float(avgp), float(maxp), float(stdp) ) ) + + # Flush stdout after each set of output prior to sleeping. + sys.stdout.flush() + + # Sleep for a bit until the next iteration.
+ time.sleep( int(delay) ) + + # If the quiet flag was given, output the final results. + if quiet: + + for i in range( num_lines0 ): + + # Parse the current line to find the performance value (only + # needed for call to re.sub() below). + line = lines0[i] + words = line.split() + if perf_col == -1: + perf = words[ len(words)-2 ] + else: + perf = words[ int(perf_col) ] + + # Compute stats for the current line. + avgp = statistics.mean( aperf[i] ) + maxp = max( aperf[i] ) + minp = min( aperf[i] ) + + # Only compute stdev() when we have two or more data points. + if len( aperf[i] ) > 1: stdp = statistics.stdev( aperf[i] ) + else: stdp = 0.0 + + # Construct a string to match the performance value and then + # use that string to search-and-replace with four format specs + # for the min, avg, max, and stdev values computed above. + search = '%8s' % perf + newline = re.sub( str(search), ' %7.2f %7.2f %7.2f %6.2f', line ) + + # Search for the column index range that would be present if this were + # matlab-compatible output. The index range will typically be 1:n, + # where n is the number of columns of data. + found_index = False + for word in words: + if re.match( '1:', word ): + index_str = word + found_index = True + break + + # If we find the column index range, we need to update it to reflect + # the replacement of one column of data with four, for a net increase + # of columns. We do so via another instance of re.sub() in which we + # search for the old index string and replace it with the new one. + if found_index: + last_col = int(index_str[2]) + 3 + new_index_str = '1:%1s' % last_col + newline = re.sub( index_str, new_index_str, newline ) + + # Output the results for the current line. + print( newline % ( float(minp), float(avgp), float(maxp), float(stdp) ) ) + + # Flush stdout afterwards. + sys.stdout.flush() + + + # Return from main(). 
+ return 0 + + + + +if __name__ == "__main__": + main() diff --git a/common.mk b/common.mk index c4ea93e22..42be1590b 100644 --- a/common.mk +++ b/common.mk @@ -438,7 +438,7 @@ INSTALL := install -c # Script for creating a monolithic header file. #FLATTEN_H := $(DIST_PATH)/build/flatten-headers.sh -FLATTEN_H := $(DIST_PATH)/build/flatten-headers.py +FLATTEN_H := $(PYTHON) $(DIST_PATH)/build/flatten-headers.py # Default archiver flags. ARFLAGS := cr diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk index 35478e0f4..5c3cc8d04 100644 --- a/config/bgq/make_defs.mk +++ b/config/bgq/make_defs.mk @@ -51,7 +51,13 @@ THIS_CONFIG := bgq # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := -I/bgsys/drivers/ppcfloor -I/bgsys/drivers/ppcfloor/spi/include/kernel/cnk +ifeq ($(CC_VENDOR),ibm) CMISCFLAGS := -qthreaded -qsmp=omp -qasm=gcc -qkeyword=asm # -qreport -qsource -qlistopt -qlist +else ifeq ($(CC_VENDOR),clang) +CMISCFLAGS := -fopenmp +else +$(error xlc or bgclang is required for this configuration.) +endif CPICFLAGS := CWARNFLAGS := -w @@ -69,8 +75,6 @@ endif CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),ibm) CKVECFLAGS := -qarch=qp -qtune=qp -qsimd=auto -qhot=level=1 -qprefetch -qunroll=yes -qnoipa -else -$(error xlc is required for this configuration.) endif # Flags specific to reference kernels. @@ -78,7 +82,11 @@ CROPTFLAGS := $(CKOPTFLAGS) CRVECFLAGS := $(CKVECFLAGS) # Override the default value for LDFLAGS. +ifeq ($(CC_VENDOR),ibm) LDFLAGS := -L/bgsys/drivers/ppcfloor/spi/lib -lSPI -lSPI_cnk -qthreaded -qsmp=omp +else ifeq ($(CC_VENDOR),clang) +LDFLAGS := -L/bgsys/drivers/ppcfloor/spi/lib -lSPI -lSPI_cnk -fopenmp +endif # Store all of the variables here to new variables containing the # configuration name. 
diff --git a/config/haswell/bli_cntx_init_haswell.c b/config/haswell/bli_cntx_init_haswell.c index 9d9e16295..844a161f3 100644 --- a/config/haswell/bli_cntx_init_haswell.c +++ b/config/haswell/bli_cntx_init_haswell.c @@ -50,22 +50,22 @@ void bli_cntx_init_haswell( cntx_t* cntx ) 8, // gemm #if 1 - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_zen_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_zen_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, #else - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_zen_asm_16x6, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen_asm_8x6, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_zen_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen_asm_4x3, FALSE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_16x6, FALSE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_8x6, FALSE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3, FALSE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3, FALSE, #endif // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_zen_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_zen_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_zen_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_zen_asm_6x8, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, cntx ); diff --git a/config/zen/bli_cntx_init_zen.c 
b/config/zen/bli_cntx_init_zen.c index 6507f421b..7b4fa01e2 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -49,16 +49,16 @@ void bli_cntx_init_zen( cntx_t* cntx ) ( 8, // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_zen_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_zen_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_zen_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_zen_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_zen_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_zen_asm_6x8, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, cntx ); diff --git a/config_registry b/config_registry index ffa24983e..0d1cbbe19 100644 --- a/config_registry +++ b/config_registry @@ -15,14 +15,14 @@ arm64: cortexa57 generic arm32: cortexa15 cortexa9 generic # Intel architectures. -skx: skx/skx/zen -knl: knl/knl/zen +skx: skx/skx/haswell/zen +knl: knl/knl/haswell/zen haswell: haswell/haswell/zen sandybridge: sandybridge penryn: penryn # AMD architectures. -zen: zen +zen: zen/zen/haswell excavator: excavator/piledriver steamroller: steamroller/piledriver piledriver: piledriver @@ -34,5 +34,8 @@ cortexa53: cortexa53/armv8a cortexa15: cortexa15/armv7a cortexa9: cortexa9/armv7a +# IBM architectures. 
+bgq: bgq + + # Generic architectures. generic: generic diff --git a/configure b/configure index e5c17fd5f..9fcf5605f 100755 --- a/configure +++ b/configure @@ -163,9 +163,6 @@ print_usage() echo " incur additional overhead in some (but not all)" echo " situations." echo " " - echo " -q, --quiet Suppress informational output. By default, configure" - echo " is verbose. (NOTE: -q is not yet implemented)" - echo " " echo " -i SIZE, --int-size=SIZE" echo " " echo " Set the size (in bits) of internal BLIS integers and" @@ -230,6 +227,19 @@ print_usage() echo " detects the presence of libmemkind, libmemkind is used" echo " by default, and otherwise it is not used by default." echo " " + echo " -r METHOD, --thread-part-jrir=METHOD" + echo " " + echo " Request a method of assigning micropanels to threads in" + echo " the JR and IR loops. Valid options are 'slab' and 'rr'." + echo " Using 'slab' assigns (as much as possible) contiguous" + echo " regions of micropanels to each thread while the latter" + echo " assigns micropanels to threads in a round-robin fashion." + echo " (NOTE: Specifying this option constitutes a *request*," + echo " which may be ignored in select situations if the" + echo " implementation has a good reason to do so.) The chosen" + echo " method also applies during the packing of A and B. The" + echo " default method is 'slab'." + echo " " echo " --force-version=STRING" echo " " echo " Force configure to use an arbitrary version string" @@ -244,6 +254,9 @@ print_usage() echo " a sanity check to make sure these lists are constituted" echo " as expected." echo " " + echo " -q, --quiet Suppress informational output. By default, configure" + echo " is verbose. (NOTE: -q is not yet implemented)" + echo " " echo " -h, --help Output this information and quit." echo " " echo " Environment Variables:" @@ -1609,6 +1622,9 @@ main() # The threading flag. threading_model='no' + # The method of assigning micropanels to threads in the JR and IR loops.
+ thread_part_jrir='slab' + # Option variables. quiet_flag='' show_config_list='' @@ -1661,7 +1677,7 @@ main() # -- Command line option/argument parsing ---------------------------------- # Process our command line options. - while getopts ":hp:d:s:t:qci:b:-:" opt; do + while getopts ":hp:d:s:t:r:qci:b:-:" opt; do case $opt in -) case "$OPTARG" in @@ -1725,6 +1741,9 @@ main() enable-threading=*) threading_model=${OPTARG#*=} ;; + thread-part-jrir=*) + thread_part_jrir=${OPTARG#*=} + ;; disable-threading) threading_model='no' ;; @@ -1808,6 +1827,9 @@ main() t) threading_model=$OPTARG ;; + r) + thread_part_jrir=$OPTARG + ;; i) int_type_size=$OPTARG ;; @@ -1823,7 +1845,7 @@ main() esac done shift $(($OPTIND - 1)) - + # Parse environment variables while [ $# -gt 0 ]; do case $1 in @@ -2383,7 +2405,7 @@ main() elif [ "x${threading_model}" = "xpthreads" ] || [ "x${threading_model}" = "xpthread" ] || [ "x${threading_model}" = "xposix" ]; then - echo "${script_name}: using Pthreads for threading." + echo "${script_name}: using POSIX threads for threading." enable_pthreads='yes' enable_pthreads_01=1 threading_model="pthreads" # Standardize the value. @@ -2394,7 +2416,22 @@ main() echo "${script_name}: *** Unsupported threading model: ${threading_model}." exit 1 fi - + # Check the method of assigning micropanels to threads in the JR and IR + # loops. + enable_jrir_slab_01=0 + enable_jrir_rr_01=0 + if [ "x${thread_part_jrir}" = "xslab" ]; then + echo "${script_name}: requesting slab threading in jr and ir loops." + enable_jrir_slab_01=1 + elif [ "x${thread_part_jrir}" = "xrr" ]; then + echo "${script_name}: requesting round-robin threading in jr and ir loops." + enable_jrir_rr_01=1 + else + echo "${script_name}: *** Unsupported method of thread partitioning in jr and ir loops: ${thread_part_jrir}." + exit 1 + fi + # Convert 'yes' and 'no' flags to booleans.
if [ "x${enable_packbuf_pools}" = "xyes" ]; then echo "${script_name}: internal memory pools for packing buffers are enabled." @@ -2461,16 +2498,7 @@ main() else echo "${script_name}: mixed datatype support is disabled." - if [ "x${enable_mixed_dt_extra_mem}" = "xyes" ]; then - echo "${script_name}: *** Mixed datatype optimizations requiring extra memory are only" - echo "${script_name}: *** available when mixed datatype support is also enabled." - echo "${script_name}: *** Please enable mixed datatype support, or disable mixed datatype" - echo "${script_name}: *** optimizations requiring extra memory, and re-run configure." - exit 1 - else - enable_mixed_dt_extra_mem_01=0 - fi - + enable_mixed_dt_extra_mem_01=0 enable_mixed_dt_01=0 fi @@ -2649,6 +2677,8 @@ main() | perl -pe "s/\@kernel_list_defines\@/${kernel_list_defines}/g" \ | sed -e "s/@enable_openmp@/${enable_openmp_01}/g" \ | sed -e "s/@enable_pthreads@/${enable_pthreads_01}/g" \ + | sed -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \ + | sed -e "s/@enable_jrir_rr@/${enable_jrir_rr_01}/g" \ | sed -e "s/@enable_packbuf_pools@/${enable_packbuf_pools_01}/g" \ | sed -e "s/@int_type_size@/${int_type_size}/g" \ | sed -e "s/@blas_int_type_size@/${blas_int_type_size}/g" \ @@ -2742,7 +2772,7 @@ main() # -- Mirror source directory hierarchies to object directories ------------- - + # Combine the config_list with the config_name and then remove duplicates. config_list_plus_name=$(rm_duplicate_words "${config_list} ${config_name}") diff --git a/docs/MixedDatatypes.md b/docs/MixedDatatypes.md index 90c2a8703..ce9981389 100644 --- a/docs/MixedDatatypes.md +++ b/docs/MixedDatatypes.md @@ -205,6 +205,17 @@ operands and thus fixed; the user may not specify a different computation domain, even if the mixed-domain case would reasonably allow for computing in either domain. 
+* **Sandboxes should be used with caution.** When building a `gemm` sandbox in +BLIS, please consider either (a) disabling mixed datatype support, or (b) +consciously **never** running the testsuite with mixed domain or precision +computation enabled. Even the reference `ref99` sandbox implementation in BLIS +does not support mixing datatypes. If you do choose to enable a sandbox while +also keeping mixed datatype support enabled in BLIS, make sure that the +mixing of datatypes is disabled in the testsuite's `input.general` file +(unless, of course, you decide to implement all mixed datatype cases within +your sandbox). This issue is also discussed in the documentation for +[Sandboxes](Sandboxes.md#known-issues). + ## Conclusion For more information and documentation on BLIS, please visit the [BLIS github page](https://github.com/flame/blis/). diff --git a/docs/Sandboxes.md b/docs/Sandboxes.md index 896e7332e..a205be02c 100644 --- a/docs/Sandboxes.md +++ b/docs/Sandboxes.md @@ -4,6 +4,7 @@ * **[Enabling a sandbox](Sandboxes.md#enabling-a-sandbox)** * **[Sandbox rules](Sandboxes.md#sandbox-rules)** * **[Caveats](Sandboxes.md#caveats)** +* **[Known Issues](Sandboxes.md#known-issues)** * **[Conclusion](Sandboxes.md#conclusion)** @@ -182,6 +183,20 @@ guidance from BLIS developers by opening a Notwithstanding these limitations, hopefully you still find BLIS sandboxes useful! +## Known Issues + +* **Mixed datatype support.** Unless you *really* know what you are doing, you +should probably disable mixed datatype support when using a sandbox. (Mixed +datatype support can be disabled by configuring with `--disable-mixed-dt`.) The +BLIS testsuite is smart enough to verify that you've configured BLIS with mixed +datatype support before allowing you to test with mixed domains/precisions +enabled in `input.general`. 
However, if those options *are* enabled and BLIS was +built with mixed datatype support, then BLIS assumes that the implementation of +`gemm` will support mixing of datatypes. BLIS *must* assume this, because +there's no way for it to confirm at runtime that an implementation was written +to support mixing datatypes. Note that even the `ref99` sandbox included with +BLIS does not support mixed-datatype computation. + ## Conclusion If you encounter any problems, or are really bummed-out that `gemm` is the diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 6c88ea893..194c66a65 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -39,9 +40,7 @@ #include "bli_packm_part.h" -#include "bli_packm_unb_var1.h" - -#include "bli_packm_blk_var1.h" +#include "bli_packm_var.h" #include "bli_packm_struc_cxk.h" #include "bli_packm_struc_cxk_4mi.h" diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 195315886..3265b3beb 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -36,29 +37,30 @@ #define FUNCPTR_T packm_fp -typedef void (*FUNCPTR_T)( - struc_t strucc, - doff_t diagoffc, - diag_t diagc, - uplo_t uploc, - trans_t transc, - pack_t schema, - bool_t invdiag, - bool_t revifup, - bool_t reviflo, - dim_t m, - dim_t n, - dim_t m_max, - dim_t n_max, - void* kappa, - void* c, inc_t rs_c, inc_t cs_c, - void* p, inc_t rs_p, inc_t cs_p, - inc_t is_p, - dim_t pd_p, inc_t ps_p, - void* packm_ker, - cntx_t* cntx, - thrinfo_t* thread - ); +typedef void (*FUNCPTR_T) + ( + struc_t strucc, + doff_t diagoffc, + diag_t diagc, + uplo_t uploc, + trans_t transc, + pack_t schema, + bool_t invdiag, + bool_t revifup, + bool_t reviflo, + dim_t m, + dim_t n, + dim_t m_max, + dim_t n_max, + void* kappa, + void* c, inc_t rs_c, inc_t cs_c, + void* p, inc_t rs_p, inc_t cs_p, + inc_t is_p, + dim_t pd_p, inc_t ps_p, + void* packm_ker, + cntx_t* cntx, + thrinfo_t* thread + ); static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1); @@ -195,7 +197,7 @@ void bli_packm_blk_var1 // use BLIS_ONE to indicate no scaling during packing. kappa_p = &BLIS_ONE; } - + // Acquire the buffer to the kappa chosen above. buf_kappa = bli_obj_buffer_for_1x1( dt_c, kappa_p ); } @@ -307,7 +309,7 @@ void PASTEMAC(ch,varname) \ ctype* restrict p_begin; \ \ dim_t iter_dim; \ - dim_t num_iter; \ + dim_t n_iter; \ dim_t it, ic, ip; \ dim_t ic0, ip0; \ doff_t ic_inc, ip_inc; \ @@ -418,16 +420,16 @@ void PASTEMAC(ch,varname) \ else { ss_num = 1; ss_den = 1; } \ \ /* Compute the total number of iterations we'll need. */ \ - num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ + n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ \ /* Set the initial values and increments for indices related to C and P based on whether reverse iteration was requested. 
*/ \ if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \ ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \ { \ - ic0 = (num_iter - 1) * panel_dim_max; \ + ic0 = (n_iter - 1) * panel_dim_max; \ ic_inc = -panel_dim_max; \ - ip0 = num_iter - 1; \ + ip0 = n_iter - 1; \ ip_inc = -1; \ } \ else \ @@ -440,16 +442,21 @@ void PASTEMAC(ch,varname) \ \ p_begin = p_cast; \ \ -/* -if ( row_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", m, n, \ - c_cast, rs_c, cs_c, "%4.1f", "" ); \ -if ( col_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", m, n, \ - c_cast, rs_c, cs_c, "%4.1f", "" ); \ -*/ \ + /* Query the number of threads and thread ids from the current thread's + packm thrinfo_t node. */ \ + const dim_t nt = bli_thread_n_way( thread ); \ + const dim_t tid = bli_thread_work_id( thread ); \ \ - for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + dim_t it_start, it_end, it_inc; \ +\ + /* Determine the thread range and increment using the current thread's + packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + will depend on whether slab or round-robin partitioning was requested + at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ +\ + /* Iterate over every logical micropanel in the source matrix. */ \ + for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \ ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ @@ -514,7 +521,11 @@ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", m, n, \ /* We nudge the imaginary stride up by one if it is odd. */ \ is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \ \ - if( packm_thread_my_iter( it, thread ) ) \ + /* NOTE: We MUST use round-robin partitioning when packing + micropanels of a triangular matrix. Hermitian/symmetric + and general packing may use slab or round-robin, depending + on which was selected at configure-time. 
*/ \ + if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \ { \ packm_ker_cast( strucc, \ diagoffp_i, \ @@ -553,7 +564,9 @@ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", m, n, \ \ is_p_use = is_p; \ \ - if( packm_thread_my_iter( it, thread ) ) \ + /* The definition of bli_packm_my_iter() will depend on whether slab + or round-robin partitioning was requested at configure-time. */ \ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ packm_ker_cast( strucc, \ diagoffc_i, \ @@ -589,7 +602,9 @@ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", m, n, \ \ is_p_use = is_p; \ \ - if( packm_thread_my_iter( it, thread ) ) \ + /* The definition of bli_packm_my_iter() will depend on whether slab + or round-robin partitioning was requested at configure-time. */ \ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ packm_ker_cast( BLIS_GENERAL, \ 0, \ @@ -613,6 +628,23 @@ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", m, n, \ p_inc = ps_p; \ } \ \ + p_begin += p_inc; \ +\ + } \ +} + +INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1 ) + + + +/* +if ( row_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \ + c_cast, rs_c, cs_c, "%4.1f", "" ); \ +if ( col_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ + c_cast, rs_c, cs_c, "%4.1f", "" ); \ +*/ /* if ( row_stored ) \ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, \ @@ -671,8 +703,7 @@ bli_thread_obarrier( thread ); \ } \ bli_thread_obarrier( thread ); \ } \ -*/ \ -\ +*/ /* if ( bli_is_4mi_packed( schema ) ) { \ printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \ @@ -695,18 +726,11 @@ bli_thread_obarrier( thread ); \ ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \ } \ } \ -*/ \ -/* -*/ \ -\ -/* -*/ \ +*/ /* PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \ ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ -*/ \ -\ -\ +*/ /* if ( row_stored ) { \ PASTEMAC(chr,fprintm)( 
stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \ @@ -719,9 +743,7 @@ bli_thread_obarrier( thread ); \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \ ( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \ } \ -*/ \ -\ -\ +*/ /* if ( col_stored ) { \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \ @@ -733,12 +755,4 @@ bli_thread_obarrier( thread ); \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \ ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \ } \ -*/ \ -\ - p_begin += p_inc; \ -\ - } \ -} - -INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1 ) - +*/ diff --git a/frame/1m/packm/bli_packm_blk_var1_md.c b/frame/1m/packm/bli_packm_blk_var1_md.c index 4efd0074c..0930f282b 100644 --- a/frame/1m/packm/bli_packm_blk_var1_md.c +++ b/frame/1m/packm/bli_packm_blk_var1_md.c @@ -146,7 +146,7 @@ void PASTEMAC2(chc,chp,varname) \ ctype_p* restrict p_begin; \ \ dim_t iter_dim; \ - dim_t num_iter; \ + dim_t n_iter; \ dim_t it, ic, ip; \ doff_t ic_inc, ip_inc; \ dim_t panel_len_full; \ @@ -220,7 +220,7 @@ void PASTEMAC2(chc,chp,varname) \ } \ \ /* Compute the total number of iterations we'll need. */ \ - num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ + n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ \ { \ ic_inc = panel_dim_max; \ @@ -229,16 +229,25 @@ void PASTEMAC2(chc,chp,varname) \ \ p_begin = p_cast; \ \ -/* -if ( row_stored ) \ -PASTEMAC(chc,fprintm)( stdout, "packm_blk_var1_md: b orig", m, n, \ - c_cast, rs_c, cs_c, "%5.2f", "" ); \ -if ( col_stored ) \ -PASTEMAC(chc,fprintm)( stdout, "packm_blk_var1_md: a orig", m, n, \ - c_cast, rs_c, cs_c, "%5.2f", "" ); \ -*/ \ + /* Query the number of threads and thread ids from the current thread's + packm thrinfo_t node. 
*/ \ + const dim_t nt = bli_thread_n_way( thread ); \ + const dim_t tid = bli_thread_work_id( thread ); \ \ - for ( ic = 0, ip = 0, it = 0; it < num_iter; \ + /* Suppress unused variable warnings when slab partitioning is enabled, + since the slab-based definition of bli_packm_my_iter() does not + actually use tid or nt. */ \ + ( void )nt; ( void )tid; \ +\ + dim_t it_start, it_end, it_inc; \ +\ + /* Determine the thread range and increment using the current thread's + packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + will depend on whether slab or round-robin partitioning was requested + at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ +\ + for ( ic = 0, ip = 0, it = 0; it < n_iter; \ ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ @@ -252,7 +261,7 @@ PASTEMAC(chc,fprintm)( stdout, "packm_blk_var1_md: a orig", m, n, \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ - if( packm_thread_my_iter( it, thread ) ) \ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ PASTEMAC2(chc,chp,packm_struc_cxk_md) \ ( \ diff --git a/frame/1m/packm/bli_packm_thrinfo.h b/frame/1m/packm/bli_packm_thrinfo.h index 41d68d356..6c77caf35 100644 --- a/frame/1m/packm/bli_packm_thrinfo.h +++ b/frame/1m/packm/bli_packm_thrinfo.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -36,7 +37,33 @@ // thrinfo_t macros specific to packm. 
// -#define packm_thread_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +/* +#define bli_packm_thread_my_iter( index, thread ) \ +\ + ( index % thread->n_way == thread->work_id % thread->n_way ) +*/ + +#define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ +\ + ( i % n_way == work_id % n_way ) + +#define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ +\ + ( start <= i && i < end ) + +// Define a general-purpose version of bli_packm_my_iter() whose definition +// depends on whether slab or round-robin partitioning was requested at +// configure-time. +#ifdef BLIS_ENABLE_JRIR_SLAB + + #define bli_packm_my_iter bli_packm_my_iter_sl + +#else // BLIS_ENABLE_JRIR_RR + + #define bli_packm_my_iter bli_packm_my_iter_rr + +#endif + // // thrinfo_t APIs specific to packm. diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_var.h similarity index 72% rename from frame/1m/packm/bli_packm_blk_var1.h rename to frame/1m/packm/bli_packm_var.h index 396160da5..7531bc9cb 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_var.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,15 +33,50 @@ */ -void bli_packm_blk_var1 - ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* t +// +// Prototype object-based interfaces. +// + +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTEMAC0(opname) \ + ( \ + obj_t* c, \ + obj_t* p, \ + cntx_t* cntx, \ + cntl_t* cntl, \ + thrinfo_t* t \ ); +GENPROT( packm_unb_var1 ) +GENPROT( packm_blk_var1 ) + +// +// Prototype BLAS-like interfaces with void pointer operands. 
+// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + trans_t transc, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROT_BASIC0( packm_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 228f22714..2110f1ec6 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -38,24 +39,34 @@ // gemm -#define bli_gemm_get_next_a_upanel( thread, a1, step ) ( a1 + step * thread->n_way ) -#define bli_gemm_get_next_b_upanel( thread, b1, step ) ( b1 + step * thread->n_way ) +// NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to +// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. +#define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) +#define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // herk -#define bli_herk_get_next_a_upanel( thread, a1, step ) ( a1 + step * thread->n_way ) -#define bli_herk_get_next_b_upanel( thread, b1, step ) ( b1 + step * thread->n_way ) +// NOTE: The definition of bli_herk_get_next_?_upanel() does not need to +// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
+#define bli_herk_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) +#define bli_herk_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm -#define bli_trmm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define bli_trmm_r_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define bli_trmm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define bli_trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +// NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to +// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. +#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) +#define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) + +#define bli_trmm_my_iter_rr( index, thread ) \ +\ + ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm -#define bli_trsm_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define bli_trsm_my_iter_rr( index, thread ) \ +\ + ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c index 0c62b69ac..73b8bed06 100644 --- a/frame/3/gemm/bli_gemm_blk_var1.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -60,7 +61,7 @@ void bli_gemm_blk_var1 bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Determine the current thread's subpartition range. 
- bli_thread_get_range_mdim + bli_thread_range_mdim ( direct, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c index 6a19e1bdb..3c25d7fa8 100644 --- a/frame/3/gemm/bli_gemm_blk_var2.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -60,7 +61,7 @@ void bli_gemm_blk_var2 bli_l3_prune_unref_mparts_n( a, b, c, cntl ); // Determine the current thread's subpartition range. - bli_thread_get_range_ndim + bli_thread_range_ndim ( direct, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 2332a6cf7..975dc8d95 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -53,11 +54,19 @@ cntl_t* bli_gemmbp_cntl_create pack_t schema_b ) { - void* macro_kernel_p = bli_gemm_ker_var2; + void* macro_kernel_fp; + void* packa_fp; + void* packb_fp; - // Change the macro-kernel if the operation family is herk or trmm. - if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; - else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; + // Use the function pointers to the macrokernels that use slab + // assignment of micropanels to threads in the jr and ir loops. 
+ if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2; + else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2; + else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2; + else /* should never execute */ macro_kernel_fp = NULL; + + packa_fp = bli_packm_blk_var1; + packb_fp = bli_packm_blk_var1; // Create two nodes for the macro-kernel. cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node @@ -72,7 +81,7 @@ cntl_t* bli_gemmbp_cntl_create ( family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() - macro_kernel_p, + macro_kernel_fp, gemm_cntl_bu_ke ); @@ -80,7 +89,7 @@ cntl_t* bli_gemmbp_cntl_create cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( bli_gemm_packa, // pack the left-hand operand - bli_packm_blk_var1, + packa_fp, BLIS_MR, BLIS_KR, FALSE, // do NOT invert diagonal @@ -104,7 +113,7 @@ cntl_t* bli_gemmbp_cntl_create cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( bli_gemm_packb, // pack the right-hand operand - bli_packm_blk_var1, + packb_fp, BLIS_KR, BLIS_NR, FALSE, // do NOT invert diagonal diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index 81552893a..07226388a 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -114,7 +115,8 @@ void bli_gemm_int if ( im != BLIS_NAT ) { - if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2; + if ( im == BLIS_4M1B ) + if ( f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2; } } diff --git a/frame/3/gemm/bli_gemm_ker_var1.c b/frame/3/gemm/bli_gemm_ker_var1.c index f7038584a..e60c78a5a 100644 --- a/frame/3/gemm/bli_gemm_ker_var1.c +++ b/frame/3/gemm/bli_gemm_ker_var1.c @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,6 +33,8 @@ */ +#if 0 + #include "blis.h" void bli_gemm_ker_var1 @@ -55,3 +58,5 @@ void bli_gemm_ker_var1 bli_gemm_ker_var2( b, a, c, cntx, rntm, cntl, thread ); } +#endif + diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 1aa032ad9..cee050b85 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -132,7 +133,6 @@ void bli_gemm_ker_var2 // real-valued beta, we can use the real domain macro-kernel, which // eliminates a little overhead associated with the 1m virtual // micro-kernel. -#if 1 if ( bli_cntx_method( cntx ) == BLIS_1M ) { bli_l3_ind_recast_1m_params @@ -146,7 +146,6 @@ void bli_gemm_ker_var2 rs_c, cs_c ); } -#endif #ifdef BLIS_ENABLE_GEMM_MD // Tweak parameters in select mixed domain cases cases. @@ -300,17 +299,29 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ \ - /* Save the desired output datatype (indicating no typecasting). */ \ - /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -325,7 +336,7 @@ void PASTEMAC(ch,varname) \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ @@ -335,12 +346,12 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ diff --git a/frame/3/gemm/bli_gemm_ker_var2_md.c b/frame/3/gemm/bli_gemm_ker_var2_md.c index e414722b9..e52aa7f9e 100644 --- a/frame/3/gemm/bli_gemm_ker_var2_md.c +++ b/frame/3/gemm/bli_gemm_ker_var2_md.c @@ -273,14 +273,29 @@ void PASTEMAC2(chc,che,varname) \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. 
*/ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype_e* restrict a1; \ ctype_c* restrict c11; \ @@ -295,7 +310,7 @@ void PASTEMAC2(chc,che,varname) \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype_e* restrict a2; \ \ @@ -305,12 +320,12 @@ void PASTEMAC2(chc,che,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index 9baee6187..61a8136ec 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -58,6 +59,7 @@ GENPROT( gemm_packa ) GENPROT( gemm_packb ) GENPROT( gemm_ker_var1 ) + GENPROT( gemm_ker_var2 ) // Headers for induced algorithms: diff --git a/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c index 878889d2a..08992145a 100644 --- a/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c +++ b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -251,6 +252,9 @@ void PASTEMAC(ch,varname) \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ dim_t ir_thread_id = bli_thread_work_id( caucus ); \ +\ + dim_t jr_inc = jr_num_threads; \ + dim_t ir_inc = ir_num_threads; \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ @@ -295,12 +299,12 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, m_iter, ir_thread_id, ir_num_threads ) ) \ { \ a2 = a_cast; \ - b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ \ diff --git a/frame/3/gemm/other/bli_gemm_ker_var2.c b/frame/3/gemm/other/bli_gemm_ker_var2.c new file mode 100644 index 000000000..b48f46bc0 --- /dev/null +++ b/frame/3/gemm/other/bli_gemm_ker_var2.c @@ -0,0 +1,366 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2); + + +void bli_gemm_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + 
// Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // If 1m is being employed on a column- or row-stored matrix with a + // real-valued beta, we can use the real domain macro-kernel, which + // eliminates a little overhead associated with the 1m virtual + // micro-kernel. +#if 1 + if ( bli_is_1m_packed( schema_a ) ) + { + bli_l3_ind_recast_1m_params + ( + dt_exec, + schema_a, + c, + m, n, k, + pd_a, ps_a, + pd_b, ps_b, + rs_c, cs_c + ); + } +#endif + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. 
*/ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t i, j; \ + dim_t m_cur; \ + dim_t n_cur; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. 
*/ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ + dim_t jr_num_threads = bli_thread_n_way( thread ); \ + dim_t jr_thread_id = bli_thread_work_id( thread ); \ + dim_t ir_num_threads = bli_thread_n_way( caucus ); \ + dim_t ir_thread_id = bli_thread_work_id( caucus ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ + if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( gemm_ker_var2 ) + diff --git a/frame/3/gemm/other/bli_gemm_ker_var2rr.c b/frame/3/gemm/other/bli_gemm_ker_var2rr.c new file mode 100644 index 000000000..3cb108eea --- /dev/null +++ b/frame/3/gemm/other/bli_gemm_ker_var2rr.c @@ -0,0 +1,380 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_gemm_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); 
+ inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // If 1m is being employed on a column- or row-stored matrix with a + // real-valued beta, we can use the real domain macro-kernel, which + // eliminates a little overhead associated with the 1m virtual + // micro-kernel. + if ( bli_is_1m_packed( schema_a ) ) + { + bli_l3_ind_recast_1m_params + ( + dt_exec, + schema_a, + c, + m, n, k, + pd_a, ps_a, + pd_b, ps_b, + rs_c, cs_c + ); + } + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. 
+ f( schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t i, j; \ + dim_t m_cur; \ + dim_t n_cur; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for each thrinfo_t node. */ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, ir_end, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, jr_end, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( gemm_ker_var2rr ) + diff --git a/frame/3/gemm/other/bli_gemm_ker_var2sl.c b/frame/3/gemm/other/bli_gemm_ker_var2sl.c new file mode 100644 index 000000000..3e9e28835 --- /dev/null +++ b/frame/3/gemm/other/bli_gemm_ker_var2sl.c @@ -0,0 +1,380 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_gemm_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); 
+ inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // If 1m is being employed on a column- or row-stored matrix with a + // real-valued beta, we can use the real domain macro-kernel, which + // eliminates a little overhead associated with the 1m virtual + // micro-kernel. + if ( bli_is_1m_packed( schema_a ) ) + { + bli_l3_ind_recast_1m_params + ( + dt_exec, + schema_a, + c, + m, n, k, + pd_a, ps_a, + pd_b, ps_b, + rs_c, cs_c + ); + } + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. 
+ f( schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t i, j; \ + dim_t m_cur; \ + dim_t n_cur; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for each thrinfo_t node. */ \ + bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_sl( i, ir_end, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_sl( j, jr_end, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( gemm_ker_var2sl ) + diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index f45542d37..8dd94efbc 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -282,17 +283,57 @@ void PASTEMAC(ch,varname) \ /* Save the desired output datatype (indicating no typecasting). */ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the rectangular + part of C, and the triangular portion. */ \ + dim_t n_iter_rct; \ + dim_t n_iter_tri; \ +\ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the second set of + loops. */ \ + n_iter_rct = n_iter; \ + n_iter_tri = 0; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the rectangular region by dividing NR into the diagonal + offset. Any remainder from this integer division is discarded, which + is what we want. That is, we want the rectangular region to contain + as many columns of whole microtiles as possible without including any + microtiles that intersect the diagonal. The number of iterations in + the triangular (or trapezoidal) region is computed as the remaining + number of iterations in the n dimension. */ \ + n_iter_rct = diagoffc / NR; \ + n_iter_tri = n_iter - n_iter_rct; \ + } \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops for + the initial rectangular region of C (if it exists). 
+ NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -307,7 +348,113 @@ void PASTEMAC(ch,varname) \ b2 = b1; \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + { \ + /* Handle interior and edge cases separately. 
*/ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no triangular region, then we're done. */ \ + if ( n_iter_tri == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + and the default (slab or rr) partitioning in the 1st loop for the + remaining triangular region of C. */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the triangular region + by the number of iterations used for the rectangular region. */ \ + jr_start += n_iter_rct; \ + jr_end += n_iter_rct; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ @@ -320,12 +467,12 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index 3061a5c39..53f27cb92 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -117,7 +118,7 @@ void bli_herk_u_ker_var2 // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; + f = ftypes[dt_exec]; // Invoke the function. f( diagoffc, @@ -229,7 +230,9 @@ void PASTEMAC(ch,varname) \ \ /* If there is a zero region to the left of where the diagonal of C intersects the top edge of the panel, adjust the pointer to C and B - and treat this case as if the diagonal offset were zero. */ \ + and treat this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning that the diagonal offset + is still positive (though it is guaranteed to be less than NR). */ \ if ( diagoffc > 0 ) \ { \ jp = diagoffc / NR; \ @@ -282,17 +285,56 @@ void PASTEMAC(ch,varname) \ /* Save the desired output datatype (indicating no typecasting). 
*/ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the triangular + part of C, and the rectangular portion. */ \ + dim_t n_iter_tri; \ + dim_t n_iter_rct; \ +\ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the first set of + loops. */ \ + n_iter_tri = 0; \ + n_iter_rct = n_iter; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the triangular (or trapezoidal) region by dividing NR + into the number of rows in C. A non-zero remainder means we need to + add one additional iteration. That is, we want the triangular region + to contain as few columns of whole microtiles as possible while still + including all microtiles that intersect the diagonal. The number of + iterations in the rectangular region is computed as the remaining + number of iterations in the n dimension. 
*/ \ + n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ + n_iter_rct = n_iter - n_iter_tri; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + and the default (slab or rr) partitioning in the 1st loop for the + initial triangular region of C (if it exists). */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir ( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -307,7 +349,7 @@ void PASTEMAC(ch,varname) \ b2 = b1; \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ @@ -320,12 +362,12 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -405,6 +447,114 @@ void PASTEMAC(ch,varname) \ } \ } \ } \ +\ + /* If there is no rectangular region, then we're done. 
*/ \ + if ( n_iter_rct == 0 ) return; \ +\ + /* Determine the thread range and increment for the 2nd loop of the + remaining rectangular region of C (and also use default partitioning + for the 1st loop). + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the rectangular region + by the number of iterations used for the triangular region. */ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ } INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) diff --git a/frame/3/herk/bli_herk_var.h b/frame/3/herk/bli_herk_var.h index 58061a8dd..d7cb75943 100644 --- a/frame/3/herk/bli_herk_var.h +++ b/frame/3/herk/bli_herk_var.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -56,6 +57,7 @@ void PASTEMAC0(opname) \ //GENPROT( herk_blk_var3 ) GENPROT( herk_x_ker_var2 ) + GENPROT( herk_l_ker_var2 ) GENPROT( herk_u_ker_var2 ) //GENPROT( herk_packa ) diff --git a/frame/3/herk/bli_herk_x_ker_var2.c b/frame/3/herk/bli_herk_x_ker_var2.c index 10b6ab826..1dc95772a 100644 --- a/frame/3/herk/bli_herk_x_ker_var2.c +++ b/frame/3/herk/bli_herk_x_ker_var2.c @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c b/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c new file mode 100644 index 000000000..bd7b69e81 --- /dev/null +++ b/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c @@ -0,0 +1,420 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); + + +void bli_herk_l_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + 
obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, ip; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely above the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region above where the diagonal of C intersects + the left edge of the panel, adjust the pointer to C and A and treat + this case as if the diagonal offset were zero. */ \ + if ( diagoffc < 0 ) \ + { \ + ip = -diagoffc / MR; \ + i = ip * MR; \ + m = m - i; \ + diagoffc = -diagoffc % MR; \ + c_cast = c_cast + (i )*rs_c; \ + a_cast = a_cast + (ip )*ps_a; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of C intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. 
*/ \ + if ( diagoffc + m < n ) \ + { \ + n = diagoffc + m; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Use interleaved (round robin) assignment of micropanels to threads in + the 2nd and 1st loops. */ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. 
*/ \ + PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 ) + diff --git a/frame/3/herk/other/bli_herk_l_ker_var2.c b/frame/3/herk/other/bli_herk_l_ker_var2.c new file mode 100644 index 000000000..832421813 --- /dev/null +++ b/frame/3/herk/other/bli_herk_l_ker_var2.c @@ -0,0 +1,409 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); + + +void bli_herk_l_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = 
bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. 
*/ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, ip; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely above the diagonal, + it is not stored. So we do nothing. 
*/ \ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region above where the diagonal of C intersects + the left edge of the panel, adjust the pointer to C and A and treat + this case as if the diagonal offset were zero. */ \ + if ( diagoffc < 0 ) \ + { \ + ip = -diagoffc / MR; \ + i = ip * MR; \ + m = m - i; \ + diagoffc = -diagoffc % MR; \ + c_cast = c_cast + (i )*rs_c; \ + a_cast = a_cast + (ip )*ps_a; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of C intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffc + m < n ) \ + { \ + n = diagoffc + m; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ + dim_t jr_num_threads = bli_thread_n_way( thread ); \ + dim_t jr_thread_id = bli_thread_work_id( thread ); \ + dim_t ir_num_threads = bli_thread_n_way( caucus ); \ + dim_t ir_thread_id = bli_thread_work_id( caucus ); \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ + if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. 
*/ \ + PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 ) + diff --git a/frame/3/herk/other/bli_herk_l_ker_var2rr.c b/frame/3/herk/other/bli_herk_l_ker_var2rr.c new file mode 100644 index 000000000..7393f8e1b --- /dev/null +++ b/frame/3/herk/other/bli_herk_l_ker_var2rr.c @@ -0,0 +1,555 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_herk_l_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* 
buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. 
*/ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, ip; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely above the diagonal, + it is not stored. So we do nothing. 
*/ \ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region above where the diagonal of C intersects + the left edge of the panel, adjust the pointer to C and A and treat + this case as if the diagonal offset were zero. */ \ + if ( diagoffc < 0 ) \ + { \ + ip = -diagoffc / MR; \ + i = ip * MR; \ + m = m - i; \ + diagoffc = -diagoffc % MR; \ + c_cast = c_cast + (i )*rs_c; \ + a_cast = a_cast + (ip )*ps_a; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of C intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffc + m < n ) \ + { \ + n = diagoffc + m; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. 
*/ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the rectangular + part of C, and the triangular portion. */ \ + dim_t n_iter_rct; \ + dim_t n_iter_tri; \ +\ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the second set of + loops. */ \ + n_iter_rct = n_iter; \ + n_iter_tri = 0; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the rectangular region by dividing NR into the diagonal + offset. Any remainder from this integer division is discarded, which + is what we want. That is, we want the rectangular region to contain + as many columns of whole microtiles as possible without including any + microtiles that intersect the diagonal. The number of iterations in + the triangular (or trapezoidal) region is computed as the remaining + number of iterations in the n dimension. */ \ + n_iter_rct = diagoffc / NR; \ + n_iter_tri = n_iter - n_iter_rct; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd and 1st + loops for the initial rectangular region of C (if it exists). */ \ + bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? 
NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no triangular region, then we're done. */ \ + if ( n_iter_tri == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd and + 1st loops for the remaining triangular region of C. */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the triangular region + by the number of iterations used for the rectangular region. */ \ + jr_start += n_iter_rct; \ + jr_end += n_iter_rct; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. */ \ + PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_l_ker_var2rr ) + diff --git a/frame/3/herk/other/bli_herk_l_ker_var2sl.c b/frame/3/herk/other/bli_herk_l_ker_var2sl.c new file mode 100644 index 000000000..569684bf7 --- /dev/null +++ b/frame/3/herk/other/bli_herk_l_ker_var2sl.c @@ -0,0 +1,556 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_herk_l_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. 
+ buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, ip; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely above the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region above where the diagonal of C intersects + the left edge of the panel, adjust the pointer to C and A and treat + this case as if the diagonal offset were zero. */ \ + if ( diagoffc < 0 ) \ + { \ + ip = -diagoffc / MR; \ + i = ip * MR; \ + m = m - i; \ + diagoffc = -diagoffc % MR; \ + c_cast = c_cast + (i )*rs_c; \ + a_cast = a_cast + (ip )*ps_a; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of C intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffc + m < n ) \ + { \ + n = diagoffc + m; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. 
*/ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the rectangular + part of C, and the triangular portion. */ \ + dim_t n_iter_rct; \ + dim_t n_iter_tri; \ +\ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the second set of + loops. */ \ + n_iter_rct = n_iter; \ + n_iter_tri = 0; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the rectangular region by dividing NR into the diagonal + offset. Any remainder from this integer division is discarded, which + is what we want. 
That is, we want the rectangular region to contain + as many columns of whole microtiles as possible without including any + microtiles that intersect the diagonal. The number of iterations in + the triangular (or trapezoidal) region is computed as the remaining + number of iterations in the n dimension. */ \ + n_iter_rct = diagoffc / NR; \ + n_iter_tri = n_iter - n_iter_rct; \ + } \ +\ + /* Use slab assignment of micropanels to threads in the 2nd and 1st + loops for the initial rectangular region of C (if it exists). */ \ + bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no triangular region, then we're done. */ \ + if ( n_iter_tri == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd + loop and slab partitioning in the 1st loop for the remaining + triangular region of C. */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the triangular region + by the number of iterations used for the rectangular region. */ \ + jr_start += n_iter_rct; \ + jr_end += n_iter_rct; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? 
NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. */ \ + PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_l_ker_var2sl ) + diff --git a/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c b/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c new file mode 100644 index 000000000..398213282 --- /dev/null +++ b/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c @@ -0,0 +1,420 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); + + +void bli_herk_u_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + 
obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, jp; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely below the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region to the left of where the diagonal of C + intersects the top edge of the panel, adjust the pointer to C and B + and treat this case as if the diagonal offset were zero. */ \ + if ( diagoffc > 0 ) \ + { \ + jp = diagoffc / NR; \ + j = jp * NR; \ + n = n - j; \ + diagoffc = diagoffc % NR; \ + c_cast = c_cast + (j )*cs_c; \ + b_cast = b_cast + (jp )*ps_b; \ + } \ +\ + /* If there is a zero region below where the diagonal of C intersects + the right edge of the panel, shrink it to prevent "no-op" iterations + from executing. 
*/ \ + if ( -diagoffc + n < m ) \ + { \ + m = -diagoffc + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Use interleaved (round robin) assignment of micropanels to threads in + the 2nd and 1st loops. */ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. 
*/ \ + PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) + diff --git a/frame/3/herk/other/bli_herk_u_ker_var2.c b/frame/3/herk/other/bli_herk_u_ker_var2.c new file mode 100644 index 000000000..8d1a3021d --- /dev/null +++ b/frame/3/herk/other/bli_herk_u_ker_var2.c @@ -0,0 +1,409 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); + + +void bli_herk_u_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = 
bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. 
*/ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, jp; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely below the diagonal, + it is not stored. So we do nothing. 
*/ \ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region to the left of where the diagonal of C + intersects the top edge of the panel, adjust the pointer to C and B + and treat this case as if the diagonal offset were zero. */ \ + if ( diagoffc > 0 ) \ + { \ + jp = diagoffc / NR; \ + j = jp * NR; \ + n = n - j; \ + diagoffc = diagoffc % NR; \ + c_cast = c_cast + (j )*cs_c; \ + b_cast = b_cast + (jp )*ps_b; \ + } \ +\ + /* If there is a zero region below where the diagonal of C intersects + the right edge of the panel, shrink it to prevent "no-op" iterations + from executing. */ \ + if ( -diagoffc + n < m ) \ + { \ + m = -diagoffc + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ + dim_t jr_num_threads = bli_thread_n_way( thread ); \ + dim_t jr_thread_id = bli_thread_work_id( thread ); \ + dim_t ir_num_threads = bli_thread_n_way( caucus ); \ + dim_t ir_thread_id = bli_thread_work_id( caucus ); \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ + if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. 
*/ \ + PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) + diff --git a/frame/3/herk/other/bli_herk_u_ker_var2rr.c b/frame/3/herk/other/bli_herk_u_ker_var2rr.c new file mode 100644 index 000000000..e0ac82745 --- /dev/null +++ b/frame/3/herk/other/bli_herk_u_ker_var2rr.c @@ -0,0 +1,557 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_herk_u_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* 
buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. 
*/ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, jp; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely below the diagonal, + it is not stored. So we do nothing. 
*/ \ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region to the left of where the diagonal of C + intersects the top edge of the panel, adjust the pointer to C and B + and treat this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning that the diagonal offset + is still positive (though it is guaranteed to be less than NR). */ \ + if ( diagoffc > 0 ) \ + { \ + jp = diagoffc / NR; \ + j = jp * NR; \ + n = n - j; \ + diagoffc = diagoffc % NR; \ + c_cast = c_cast + (j )*cs_c; \ + b_cast = b_cast + (jp )*ps_b; \ + } \ +\ + /* If there is a zero region below where the diagonal of C intersects + the right edge of the panel, shrink it to prevent "no-op" iterations + from executing. */ \ + if ( -diagoffc + n < m ) \ + { \ + m = -diagoffc + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the triangular + part of C, and the rectangular portion. */ \ + dim_t n_iter_tri; \ + dim_t n_iter_rct; \ +\ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the first set of + loops. */ \ + n_iter_tri = 0; \ + n_iter_rct = n_iter; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the triangular (or trapezoidal) region by dividing NR + into the number of rows in C. A non-zero remainder means we need to + add one additional iteration. That is, we want the triangular region + to contain as few columns of whole microtiles as possible while still + including all microtiles that intersect the diagonal. The number of + iterations in the rectangular region is computed as the remaining + number of iterations in the n dimension. */ \ + n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ + n_iter_rct = n_iter - n_iter_tri; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd and 1st + loops for the initial triangular region of C (if it exists). */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. 
*/ \ + PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no rectangular region, then we're done. */ \ + if ( n_iter_rct == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd and 1st + loops for the remaining triangular region of C. */ \ + bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the rectangular region + by the number of iterations used for the triangular region. */ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). 
*/ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_u_ker_var2rr ) + diff --git a/frame/3/herk/other/bli_herk_u_ker_var2sl.c b/frame/3/herk/other/bli_herk_u_ker_var2sl.c new file mode 100644 index 000000000..b182561d7 --- /dev/null +++ b/frame/3/herk/other/bli_herk_u_ker_var2sl.c @@ -0,0 +1,558 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T herk_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_herk_u_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c 
= bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffc, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. 
For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, jp; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely below the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region to the left of where the diagonal of C + intersects the top edge of the panel, adjust the pointer to C and B + and treat this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning that the diagonal offset + is still positive (though it is guaranteed to be less than NR). 
*/ \ + if ( diagoffc > 0 ) \ + { \ + jp = diagoffc / NR; \ + j = jp * NR; \ + n = n - j; \ + diagoffc = diagoffc % NR; \ + c_cast = c_cast + (j )*cs_c; \ + b_cast = b_cast + (jp )*ps_b; \ + } \ +\ + /* If there is a zero region below where the diagonal of C intersects + the right edge of the panel, shrink it to prevent "no-op" iterations + from executing. */ \ + if ( -diagoffc + n < m ) \ + { \ + m = -diagoffc + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. 
*/ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the triangular + part of C, and the rectangular portion. */ \ + dim_t n_iter_tri; \ + dim_t n_iter_rct; \ +\ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the first set of + loops. */ \ + n_iter_tri = 0; \ + n_iter_rct = n_iter; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the triangular (or trapezoidal) region by dividing NR + into the number of rows in C. A non-zero remainder means we need to + add one additional iteration. That is, we want the triangular region + to contain as few columns of whole microtiles as possible while still + including all microtiles that intersect the diagonal. The number of + iterations in the rectangular region is computed as the remaining + number of iterations in the n dimension. */ \ + n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ + n_iter_rct = n_iter - n_iter_tri; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + and slab partitioning in the 1st loop for the initial triangular region + of C (if it exists). */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. 
*/ \ + PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no rectangular region, then we're done. */ \ + if ( n_iter_rct == 0 ) return; \ +\ + /* Use slab assignment of micropanels to threads in the 2nd and 1st loops + loop for the remaining triangular region of C. */ \ + bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the rectangular region + by the number of iterations used for the triangular region. */ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). 
*/ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + { \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the edge of C and add the result. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( herk_u_ker_var2sl ) + diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 3778c7302..4d6b49a25 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -85,6 +86,10 @@ void bli_trmm_front } #if 0 + // NOTE: This case casts right-side trmm in terms of left side. This + // reduces the number of macrokernels exercised to two (trmm_ll and + // trmm_lu) but can lead to the microkernel being executed with an + // output matrix that is stored counter to its output preference. // If A is being multiplied from the right, transpose all operands // so that we can perform the computation as if A were being multiplied @@ -98,6 +103,11 @@ void bli_trmm_front } #else + // NOTE: This case computes right-side trmm natively with trmm_rl and + // trmm_ru macrokernels. This code path always gives us the opportunity + // to transpose the entire operation so that the effective storage format + // of the output matrix matches the microkernel's output preference. + // Thus, from a performance perspective, this case is preferred. // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index eef104eed..a9df2571a 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ - thrinfo_t* jr_thread \ + thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -320,29 +321,45 @@ void PASTEMAC(ch,varname) \ /* Save the desired output datatype (indicating no typecasting). */ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ - dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ - dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ + dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ +\ + dim_t jr_start, jr_end; \ + /*dim_t ir_start, ir_end;*/ \ + dim_t jr_inc; \ +\ + /* Determine the thread range and increment for the 2nd loop. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. \ + NOTE: Parallelism in the 1st loop is disabled for now. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ \ /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ - if ( bli_trmm_l_jr_my_iter( j, jr_thread ) ) { \ -\ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ @@ -372,17 +389,18 @@ void PASTEMAC(ch,varname) \ is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ - if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -436,23 +454,24 @@ void PASTEMAC(ch,varname) \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ - } \ + /*}*/ \ \ a1 += ps_a_cur; \ } \ else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ { \ - if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -501,17 +520,13 @@ void PASTEMAC(ch,varname) \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ - } \ + /*}*/ \ \ a1 += rstep_a; \ } \ \ c11 += rstep_c; \ } \ - } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 23dd22cb8..bb843c84d 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ - thrinfo_t* jr_thread \ + thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -327,29 +328,45 @@ void PASTEMAC(ch,varname) \ /* Save the desired output datatype (indicating no typecasting). */ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ - dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ - dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ + dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ +\ + dim_t jr_start, jr_end; \ + /*dim_t ir_start, ir_end;*/ \ + dim_t jr_inc; \ +\ + /* Determine the thread range and increment for the 2nd loop. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. \ + NOTE: Parallelism in the 1st loop is disabled for now. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ - if ( bli_trmm_l_jr_my_iter( j, jr_thread ) ) { \ -\ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ @@ -379,17 +396,18 @@ void PASTEMAC(ch,varname) \ is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ - if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \ + /* NOTE: ir loop parallelism disabled for now. 
*/ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -443,23 +461,24 @@ void PASTEMAC(ch,varname) \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ - } \ + /*}*/ \ \ a1 += ps_a_cur; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ - if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -508,17 +527,13 @@ void PASTEMAC(ch,varname) \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ - } \ + /*}*/ \ \ a1 += rstep_a; \ } \ \ c11 += rstep_c; \ } \ - } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index ae44e8ff9..e03de3e08 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ - thrinfo_t* jr_thread \ + thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -327,15 +328,152 @@ void PASTEMAC(ch,varname) \ /* Save the desired output datatype (indicating no typecasting). */ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ - b1 = b_cast; \ - c1 = c_cast; \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ - dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ - dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the rectangular + part of B, and the triangular portion. */ \ + dim_t n_iter_rct; \ + dim_t n_iter_tri; \ +\ + if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \ + { \ + /* If the entire panel of B does not intersect the diagonal, there is + no triangular region, and therefore we can skip the second set of + loops. */ \ + n_iter_rct = n_iter; \ + n_iter_tri = 0; \ + } \ + else \ + { \ + /* If the panel of B does intersect the diagonal, compute the number of + iterations in the rectangular region by dividing NR into the diagonal + offset. (There should never be any remainder in this division.) The + number of iterations in the triangular (or trapezoidal) region is + computed as the remaining number of iterations in the n dimension. 
*/ \ + n_iter_rct = diagoffb / NR; \ + n_iter_tri = n_iter - n_iter_rct; \ + } \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops for + the initial rectangular region of B (if it exists). + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. \ + NOTE: Parallelism in the 1st loop is disabled for now. */ \ + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. 
*/ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no triangular region, then we're done. */ \ + if ( n_iter_tri == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd and + 1st loops for the remaining triangular region of B (if it exists). + NOTE: We don't need to call bli_thread_range_jrir_rr() here since we + employ a hack that calls for each thread to execute every iteration + of the jr and ir loops but skip all but the pointer increment for + iterations that are not assigned to it. */ \ +\ + /* Advance the starting b1 and c1 pointers to the positions corresponding + to the start of the triangular region of B. */ \ + jr_start = n_iter_rct; \ + b1 = b_cast + jr_start * cstep_b; \ + c1 = c_cast + jr_start * cstep_c; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < n_iter; ++j ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -361,7 +499,6 @@ void PASTEMAC(ch,varname) \ by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ - if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ @@ -369,7 +506,7 @@ void PASTEMAC(ch,varname) \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 
1 : 0 ); \ ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ - if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \ + if ( bli_trmm_my_iter_rr( j, thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ @@ -378,7 +515,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \ + if ( bli_trmm_my_iter_rr( i, caucus ) ) { \ \ ctype* restrict a1_i; \ ctype* restrict a2; \ @@ -389,11 +526,11 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -452,83 +589,6 @@ void PASTEMAC(ch,varname) \ \ b1 += ps_b_cur; \ } \ - else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \ - { \ - if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \ -\ - /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t - object. */ \ - bli_auxinfo_set_is_b( istep_b, &aux ); \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \ -\ - ctype* restrict a2; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Handle interior and edge cases separately. 
*/ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Add the result to the edge of C. */ \ - PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ - } \ - } \ -\ - b1 += cstep_b; \ - } \ \ c1 += cstep_c; \ } \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 9d7ec4cfe..5261bf13f 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ - thrinfo_t* jr_thread \ + thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -196,7 +197,7 @@ void PASTEMAC(ch,varname) \ dim_t n_cur; \ dim_t k_b0111; \ dim_t off_b0111; \ - dim_t i, j; \ + dim_t i, j, jb0; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ @@ -327,16 +328,58 @@ void PASTEMAC(ch,varname) \ \ /* Save the desired output datatype (indicating no typecasting). */ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. 
*/ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the triangular + part of C, and the rectangular portion. */ \ + dim_t n_iter_tri; \ + dim_t n_iter_rct; \ +\ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \ + { \ + /* If the entire panel of B does not intersect the diagonal, there is + no triangular region, and therefore we can skip the first set of + loops. */ \ + n_iter_tri = 0; \ + n_iter_rct = n_iter; \ + } \ + else \ + { \ + /* If the panel of B does intersect the diagonal, compute the number of + iterations in the triangular (or trapezoidal) region by dividing NR + into the number of rows in B. (There should never be any remainder + in this division.) The number of iterations in the rectangular region + is computed as the remaining number of iterations in the n dimension. */ \ + n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \ + n_iter_rct = n_iter - n_iter_tri; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd and + 1st loops for the initial triangular region of B (if it exists). + NOTE: We don't need to call bli_thread_range_jrir_rr() here since we + employ a hack that calls for each thread to execute every iteration + of the jr and ir loops but skip all but the pointer increment for + iterations that are not assigned to it. */ \ \ b1 = b_cast; \ c1 = c_cast; \ -\ - thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ - dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ - dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = 0; j < n_iter_tri; ++j ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -361,7 +404,6 @@ void PASTEMAC(ch,varname) \ by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ - if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ @@ -369,7 +411,7 @@ void PASTEMAC(ch,varname) \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ - if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \ + if ( bli_trmm_my_iter_rr( j, thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ @@ -378,7 +420,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \ + if ( bli_trmm_my_iter_rr( i, caucus ) ) { \ \ ctype* restrict a1_i; \ ctype* restrict a2; \ @@ -389,11 +431,11 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -452,30 +494,75 @@ void PASTEMAC(ch,varname) \ \ b1 += ps_b_cur; \ } \ - else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \ - { \ - if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ + c1 += cstep_c; \ + } \ +\ + /* If there is no rectangular region, then we're done. */ \ + if ( n_iter_rct == 0 ) return; \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops for + the remaining rectangular region of B. 
+ NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. \ + NOTE: Parallelism in the 1st loop is disabled for now. */ \ + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Advance the start and end iteration offsets for the rectangular region + by the number of iterations used for the triangular region. */ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ + jb0 = n_iter_tri; \ +\ + /* Save the resulting value of b1 from the previous loop since it represents + the starting point for the rectangular region. */ \ + b_cast = b1; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + /* NOTE: We must index through b_cast differently since it contains + the starting address of the rectangular region (which is already + n_iter_tri logical iterations through B). */ \ + b1 = b_cast + (j-jb0) * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + { \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ /* Loop over the m dimension (MR rows at a time). 
*/ \ - for ( i = 0; i < m_iter; ++i ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ - if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \ -\ ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -520,19 +607,12 @@ void PASTEMAC(ch,varname) \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ } \ - } \ -\ - b1 += cstep_b; \ } \ -\ - c1 += cstep_c; \ } \ \ +\ +\ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ } diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h index bde7977b5..4355fed71 100644 --- a/frame/3/trmm/bli_trmm_var.h +++ b/frame/3/trmm/bli_trmm_var.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -56,6 +57,7 @@ void PASTEMAC0(opname) \ //GENPROT( trmm_blk_var3 ) GENPROT( trmm_xx_ker_var2 ) + GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c index d0e157877..df12c25ac 100644 --- a/frame/3/trmm/bli_trmm_xx_ker_var2.c +++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c new file mode 100644 index 000000000..fbbbb7b2f --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c @@ -0,0 +1,519 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); + + +void bli_trmm_ll_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* 
buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* jr_thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. 
For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1011; \ + dim_t off_a1011; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current block of A is entirely above the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. 
This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of A intersects the + left edge of the block, adjust the pointer to C and treat this case as + if the diagonal offset were zero. This skips over the region that was + not packed. (Note we assume the diagonal offset is a multiple of MR; + this assumption will hold as long as the cache blocksizes are each a + multiple of MR and NR.) 
*/ \ + if ( diagoffa < 0 ) \ + { \ + i = -diagoffa; \ + m = m - i; \ + diagoffa = 0; \ + c_cast = c_cast + (i )*rs_c; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ + dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ + dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? 
MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict b1_i; \ + ctype* restrict a2; \ +\ + /* Determine the offset to and length of the panel that was + packed so we can index into the corresponding location in + b1. */ \ + off_a1011 = 0; \ + k_a1011 = bli_min( diagoffa_i + MR, k ); \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1011 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + c1 += cstep_c; \ + } \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 ) + diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c new file mode 100644 index 000000000..a940fdb6f --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c @@ -0,0 +1,535 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_trmm_ll_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + 
+ void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1011; \ + dim_t off_a1011; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current block of A is entirely above the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. 
This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of A intersects the + left edge of the block, adjust the pointer to C and treat this case as + if the diagonal offset were zero. This skips over the region that was + not packed. (Note we assume the diagonal offset is a multiple of MR; + this assumption will hold as long as the cache blocksizes are each a + multiple of MR and NR.) 
*/ \ + if ( diagoffa < 0 ) \ + { \ + i = -diagoffa; \ + m = m - i; \ + diagoffa = 0; \ + c_cast = c_cast + (i )*rs_c; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ + dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ +\ + dim_t jr_start, jr_end; \ + /*dim_t ir_start, ir_end;*/ \ + dim_t jr_inc; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop for + the initial rectangular region of C (if it exists). + NOTE: Parallelism in the 1st loop is disabled for now. 
*/ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict b1_i; \ + ctype* restrict a2; \ +\ + /* Determine the offset to and length of the panel that was + packed so we can index into the corresponding location in + b1. */ \ + off_a1011 = 0; \ + k_a1011 = bli_min( diagoffa_i + MR, k ); \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1011 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ +\ + b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + /*}*/ \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ + { \ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + /*}*/ \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2rr: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2rr: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2rr ) + diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c new file mode 100644 index 000000000..718c6fba1 --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c @@ -0,0 +1,535 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_trmm_ll_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( 
a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. 
*/ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1011; \ + dim_t off_a1011; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. 
*/ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current block of A is entirely above the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. 
*/ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of A intersects the + left edge of the block, adjust the pointer to C and treat this case as + if the diagonal offset were zero. This skips over the region that was + not packed. (Note we assume the diagonal offset is a multiple of MR; + this assumption will hold as long as the cache blocksizes are each a + multiple of MR and NR.) */ \ + if ( diagoffa < 0 ) \ + { \ + i = -diagoffa; \ + m = m - i; \ + diagoffa = 0; \ + c_cast = c_cast + (i )*rs_c; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ + dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ +\ + dim_t jr_start, jr_end; \ + /*dim_t ir_start, ir_end;*/ \ + dim_t jr_inc; \ +\ + /* Use slab assignment of micropanels to threads in the 2nd loop for + the initial rectangular region of C (if it exists). + NOTE: Parallelism in the 1st loop is disabled for now. */ \ + bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict b1_i; \ + ctype* restrict a2; \ +\ + /* Determine the offset to and length of the panel that was + packed so we can index into the corresponding location in + b1. 
*/ \ + off_a1011 = 0; \ + k_a1011 = bli_min( diagoffa_i + MR, k ); \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1011 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ +\ + b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + /*}*/ \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ + { \ + /* NOTE: ir loop parallelism disabled for now. 
*/ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + /*}*/ \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2sl: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2sl: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2sl ) + diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c new file mode 100644 index 000000000..2fe01d0e2 --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c @@ -0,0 +1,527 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2); + + +void bli_trmm_lu_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. 
+ f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* jr_thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \
+\
+ ctype* restrict one = PASTEMAC(ch,1); \
+ ctype* restrict zero = PASTEMAC(ch,0); \
+ ctype* restrict a_cast = a; \
+ ctype* restrict b_cast = b; \
+ ctype* restrict c_cast = c; \
+ ctype* restrict alpha_cast = alpha; \
+ ctype* restrict beta_cast = beta; \
+ ctype* restrict b1; \
+ ctype* restrict c1; \
+\
+ doff_t diagoffa_i; \
+ dim_t k_full; \
+ dim_t m_iter, m_left; \
+ dim_t n_iter, n_left; \
+ dim_t m_cur; \
+ dim_t n_cur; \
+ dim_t k_a1112; \
+ dim_t off_a1112; \
+ dim_t i, j; \
+ inc_t rstep_a; \
+ inc_t cstep_b; \
+ inc_t rstep_c, cstep_c; \
+ inc_t istep_a; \
+ inc_t istep_b; \
+ inc_t off_scl; \
+ inc_t ss_a_num; \
+ inc_t ss_a_den; \
+ inc_t ps_a_cur; \
+ inc_t is_a_cur; \
+ auxinfo_t aux; \
+\
+ /*
 Assumptions/assertions:
 rs_a == 1
 cs_a == PACKMR
 pd_a == MR
 ps_a == stride to next micro-panel of A
 rs_b == PACKNR
 cs_b == 1
 pd_b == NR
 ps_b == stride to next micro-panel of B
 rs_c == (no assumptions)
 cs_c == (no assumptions)
 */ \
+\
+ /* Safety trap: Certain indexing within this macro-kernel does not
 work as intended if both MR and NR are odd. */ \
+ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
+ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
+\
+ /* If any dimension is zero, return immediately. */ \
+ if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+ /* Safeguard: If the current block of A is entirely below the diagonal,
 it is implicitly zero. So we do nothing. */ \
+ if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
+\
+ /* Compute k_full. For all trmm, k_full is simply k. This is
 needed because some parameter combinations of trmm reduce k
 to advance past zero regions in the triangular matrix, and
 when computing the imaginary stride of B (the non-triangular
 matrix), which is used by 4m1/3m1 implementations, we need
 this unreduced value of k. */ \
+ k_full = k; \
+\
+ /* Compute indexing scaling factor for 4m or 3m.
This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of A + intersects the top edge of the block, adjust the pointer to B and + treat this case as if the diagonal offset were zero. Note that we + don't need to adjust the pointer to A since packm would have simply + skipped over the region that was not stored. */ \ + if ( diagoffa > 0 ) \ + { \ + i = diagoffa; \ + k = k - i; \ + diagoffa = 0; \ + b_cast = b_cast + ( i * PACKNR ) / off_scl; \ + } \ +\ + /* If there is a zero region below where the diagonal of A intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. 
*/ \ + if ( -diagoffa + k < m ) \ + { \ + m = -diagoffa + k; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ + dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ + dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, scale C + by beta. 
If it is strictly above the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict b1_i; \ + ctype* restrict a2; \ +\ + /* Determine the offset to and length of the panel that was + packed so we can index into the corresponding location in + b1. */ \ + off_a1112 = diagoffa_i; \ + k_a1112 = k - off_a1112; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1112 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1112, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1112, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. 
*/ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + c1 += cstep_c; \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 ) + diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c new file mode 100644 index 000000000..ab1efa46d --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c @@ -0,0 +1,542 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_trmm_lu_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + 
+ void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1112; \ + dim_t off_a1112; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current block of A is entirely below the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. 
This is
 needed because some parameter combinations of trmm reduce k
 to advance past zero regions in the triangular matrix, and
 when computing the imaginary stride of B (the non-triangular
 matrix), which is used by 4m1/3m1 implementations, we need
 this unreduced value of k. */ \
+ k_full = k; \
+\
+ /* Compute indexing scaling factor for 4m or 3m. This is
 needed because one of the packing register blocksizes (PACKMR
 or PACKNR) is used to index into the micro-panels of the non-
 triangular matrix when computing with a diagonal-intersecting
 micro-panel of the triangular matrix. In the case of 4m or 3m,
 real values are stored in both sub-panels, and so the indexing
 needs to occur in units of real values. The value computed
 here is divided into the complex pointer offset to cause the
 pointer to be advanced by the correct value. */ \
+ if ( bli_is_4mi_packed( schema_a ) || \
+ bli_is_3mi_packed( schema_a ) || \
+ bli_is_rih_packed( schema_a ) ) off_scl = 2; \
+ else off_scl = 1; \
+\
+ /* Compute the storage stride scaling. Usually this is just 1.
 However, in the case of interleaved 3m, we need to scale the
 offset by 3/2. And if we are packing real-only, imag-only, or
 summed-only, we need to scale the computed panel sizes by 1/2
 to compensate for the fact that the pointer arithmetic occurs
 in terms of complex elements rather than real elements. */ \
+ if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
+ else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
+ else { ss_a_num = 1; ss_a_den = 1; } \
+\
+ /* If there is a zero region to the left of where the diagonal of A
 intersects the top edge of the block, adjust the pointer to B and
 treat this case as if the diagonal offset were zero. Note that we
 don't need to adjust the pointer to A since packm would have simply
 skipped over the region that was not stored.
*/ \ + if ( diagoffa > 0 ) \ + { \ + i = diagoffa; \ + k = k - i; \ + diagoffa = 0; \ + b_cast = b_cast + ( i * PACKNR ) / off_scl; \ + } \ +\ + /* If there is a zero region below where the diagonal of A intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffa + k < m ) \ + { \ + m = -diagoffa + k; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. 
*/ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ + dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ +\ + dim_t jr_start, jr_end; \ + /*dim_t ir_start, ir_end;*/ \ + dim_t jr_inc; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop for + the initial rectangular region of C (if it exists). */ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, scale C + by beta. If it is strictly above the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict b1_i; \ + ctype* restrict a2; \ +\ + /* Determine the offset to and length of the panel that was + packed so we can index into the corresponding location in + b1. */ \ + off_a1112 = diagoffa_i; \ + k_a1112 = k - off_a1112; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1112 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 
1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ +\ + b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1112, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1112, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + /*}*/ \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ + { \ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + /*}*/ \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2rr: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2rr: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2rr ) + diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c new file mode 100644 index 000000000..1bb4e1b6d --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c @@ -0,0 +1,542 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_trmm_lu_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. 
+ buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1112; \ + dim_t off_a1112; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current block of A is entirely below the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for for 4m or 3m. 
This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of A + intersects the top edge of the block, adjust the pointer to B and + treat this case as if the diagonal offset were zero. Note that we + don't need to adjust the pointer to A since packm would have simply + skipped over the region that was not stored. */ \ + if ( diagoffa > 0 ) \ + { \ + i = diagoffa; \ + k = k - i; \ + diagoffa = 0; \ + b_cast = b_cast + ( i * PACKNR ) / off_scl; \ + } \ +\ + /* If there is a zero region below where the diagonal of A intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. 
*/ \ + if ( -diagoffa + k < m ) \ + { \ + m = -diagoffa + k; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ + dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ +\ + dim_t jr_start, jr_end; \ + /*dim_t ir_start, ir_end;*/ \ + dim_t jr_inc; \ +\ + /* Use slab assignment of micropanels to threads in the 2nd loop for + the initial rectangular region of C (if it exists). 
*/ \ + bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, scale C + by beta. If it is strictly above the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict b1_i; \ + ctype* restrict a2; \ +\ + /* Determine the offset to and length of the panel that was + packed so we can index into the corresponding location in + b1. */ \ + off_a1112 = diagoffa_i; \ + k_a1112 = k - off_a1112; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1112 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ +\ + b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1112, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_a1112, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + /*}*/ \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ + { \ + /* NOTE: ir loop parallelism disabled for now. */ \ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + /*}*/ \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2sl: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2sl: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2sl ) + diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c new file mode 100644 index 000000000..860295c4c --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c @@ -0,0 +1,539 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); + + +void bli_trmm_rl_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t 
ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* jr_thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. 
*/ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b1121; \ + dim_t off_b1121; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. 
*/ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely above the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. 
*/ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of B intersects + the left edge of the panel, adjust the pointer to A and treat this + case as if the diagonal offset were zero. Note that we don't need to + adjust the pointer to B since packm would have simply skipped over + the region that was not stored. */ \ + if ( diagoffb < 0 ) \ + { \ + j = -diagoffb; \ + k = k - j; \ + diagoffb = 0; \ + a_cast = a_cast + ( j * PACKMR ) / off_scl; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of B intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffb + k < n ) \ + { \ + n = diagoffb + k; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. 
*/ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ + dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ + dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ +\ + /* Determine the offset to the beginning of the panel that + was packed so we can index into the corresponding location + in A. Then compute the length of that panel. */ \ + off_b1121 = bli_max( -diagoffb_j, 0 ); \ + k_b1121 = k - off_b1121; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_b_cur = k_b1121 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a1_i; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b1121, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b1121, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += ps_b_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \ + { \ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + } \ +\ + c1 += cstep_c; \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 ) + diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c new file mode 100644 index 000000000..1b1549951 --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c @@ -0,0 +1,598 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_trmm_rl_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. 
+ buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b1121; \ + dim_t off_b1121; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely above the diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for for 4m or 3m. 
This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of B intersects + the left edge of the panel, adjust the pointer to A and treat this + case as if the diagonal offset were zero. Note that we don't need to + adjust the pointer to B since packm would have simply skipped over + the region that was not stored. */ \ + if ( diagoffb < 0 ) \ + { \ + j = -diagoffb; \ + k = k - j; \ + diagoffb = 0; \ + a_cast = a_cast + ( j * PACKMR ) / off_scl; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of B intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffb + k < n ) \ + { \ + n = diagoffb + k; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. 
*/ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the rectangular + part of B, and the triangular portion. */ \ + dim_t n_iter_rct; \ + dim_t n_iter_tri; \ +\ + if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \ + { \ + /* If the entire panel of B does not intersect the diagonal, there is + no triangular region, and therefore we can skip the second set of + loops. */ \ + n_iter_rct = n_iter; \ + n_iter_tri = 0; \ + } \ + else \ + { \ + /* If the panel of B does intersect the diagonal, compute the number of + iterations in the rectangular region by dividing NR into the diagonal + offset. (There should never be any remainder in this division.) 
The + number of iterations in the triangular (or trapezoidal) region is + computed as the remaining number of iterations in the n dimension. */ \ + n_iter_rct = diagoffb / NR; \ + n_iter_tri = n_iter - n_iter_rct; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd and 1st + loops for the initial rectangular region of B (if it exists). */ \ + bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. 
*/ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no triangular region, then we're done. */ \ + if ( n_iter_tri == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + for the remaining triangular region of B (if it exists). + NOTE: We don't need to call bli_thread_range_jrir*() here since we + employ a hack that calls for each thread to execute every iteration + of the jr and ir loops but skip all but the pointer increment for + iterations that are not assigned to it. */ \ +\ + /* Advance the starting b1 and c1 pointers to the positions corresponding + to the start of the triangular region of B. */ \ + jr_start = n_iter_rct; \ + b1 = b_cast + jr_start * cstep_b; \ + c1 = c_cast + jr_start * cstep_c; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < n_iter; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ +\ + /* Determine the offset to the beginning of the panel that + was packed so we can index into the corresponding location + in A. Then compute the length of that panel. */ \ + off_b1121 = bli_max( -diagoffb_j, 0 ); \ + k_b1121 = k - off_b1121; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. 
*/ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + { \ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_b_cur = k_b1121 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + if ( bli_trmm_my_iter( j, thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, caucus ) ) { \ +\ + ctype* restrict a1_i; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b1121, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k_b1121, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += ps_b_cur; \ + } \ +\ + c1 += cstep_c; \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2rr: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2rr: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2rr ) + diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c new file mode 100644 index 000000000..80e9c7f2f --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c @@ -0,0 +1,598 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_trmm_rl_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b 
); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. 
Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b1121; \ + dim_t off_b1121; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely above the diagonal, + it is implicitly zero. So we do nothing. 
*/ \ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of B intersects + the left edge of the panel, adjust the pointer to A and treat this + case as if the diagonal offset were zero. 
Note that we don't need to + adjust the pointer to B since packm would have simply skipped over + the region that was not stored. */ \ + if ( diagoffb < 0 ) \ + { \ + j = -diagoffb; \ + k = k - j; \ + diagoffb = 0; \ + a_cast = a_cast + ( j * PACKMR ) / off_scl; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of B intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffb + k < n ) \ + { \ + n = diagoffb + k; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the rectangular + part of B, and the triangular portion. 
*/ \ + dim_t n_iter_rct; \ + dim_t n_iter_tri; \ +\ + if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \ + { \ + /* If the entire panel of B does not intersect the diagonal, there is + no triangular region, and therefore we can skip the second set of + loops. */ \ + n_iter_rct = n_iter; \ + n_iter_tri = 0; \ + } \ + else \ + { \ + /* If the panel of B does intersect the diagonal, compute the number of + iterations in the rectangular region by dividing NR into the diagonal + offset. (There should never be any remainder in this division.) The + number of iterations in the triangular (or trapezoidal) region is + computed as the remaining number of iterations in the n dimension. */ \ + n_iter_rct = diagoffb / NR; \ + n_iter_tri = n_iter - n_iter_rct; \ + } \ +\ + /* Use slab assignment of micropanels to threads in the 2nd and 1st + loops for the initial rectangular region of B (if it exists). */ \ + bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ + /* If there is no triangular region, then we're done. */ \ + if ( n_iter_tri == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + for the remaining triangular region of B (if it exists). + NOTE: We don't need to call bli_thread_range_jrir*() here since we + employ a hack that calls for each thread to execute every iteration + of the jr and ir loops but skip all but the pointer increment for + iterations that are not assigned to it. */ \ +\ + /* Advance the starting b1 and c1 pointers to the positions corresponding + to the start of the triangular region of B. */ \ + jr_start = n_iter_rct; \ + b1 = b_cast + jr_start * cstep_b; \ + c1 = c_cast + jr_start * cstep_c; \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( j = jr_start; j < n_iter; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ +\ + /* Determine the offset to the beginning of the panel that + was packed so we can index into the corresponding location + in A. Then compute the length of that panel. */ \ + off_b1121 = bli_max( -diagoffb_j, 0 ); \ + k_b1121 = k - off_b1121; \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + { \ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_b_cur = k_b1121 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + if ( bli_trmm_my_iter( j, thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, caucus ) ) { \ +\ + ctype* restrict a1_i; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b1121, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b1121, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += ps_b_cur; \ + } \ +\ + c1 += cstep_c; \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2sl: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2sl: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2sl ) + diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c new file mode 100644 index 000000000..e0adf4cf2 --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c @@ -0,0 +1,539 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); + + +void bli_trmm_ru_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. 
+ f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* jr_thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b0111; \ + dim_t off_b0111; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely below its diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for for 4m or 3m. 
This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of B + intersects the top edge of the panel, adjust the pointer to C and + treat this case as if the diagonal offset were zero. This skips over + the region that was not packed. (Note we assume the diagonal offset + is a multiple of MR; this assumption will hold as long as the cache + blocksizes are each a multiple of MR and NR.) */ \ + if ( diagoffb > 0 ) \ + { \ + j = diagoffb; \ + n = n - j; \ + diagoffb = 0; \ + c_cast = c_cast + (j )*cs_c; \ + } \ +\ + /* If there is a zero region below where the diagonal of B intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. 
*/ \ + if ( -diagoffb + n < k ) \ + { \ + k = -diagoffb + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ + dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ + dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ +\ + /* Determine the offset to and length of the panel that was packed + so we can index into the corresponding location in A. */ \ + off_b0111 = 0; \ + k_b0111 = bli_min( k, -diagoffb_j + NR ); \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. 
If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_b_cur = k_b0111 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a1_i; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b0111, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b0111, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. 
*/ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += ps_b_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \ + { \ + if ( bli_trmm_my_iter( j, jr_thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, ir_thread ) ) { \ +\ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + } \ +\ + c1 += cstep_c; \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 ) + diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c new file mode 100644 index 000000000..ff118ab6d --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c @@ -0,0 +1,618 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_trmm_ru_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + 
+ void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b0111; \ + dim_t off_b0111; \ + dim_t i, j, jb0; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely below its diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. 
This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of B + intersects the top edge of the panel, adjust the pointer to C and + treat this case as if the diagonal offset were zero. This skips over + the region that was not packed. (Note we assume the diagonal offset + is a multiple of MR; this assumption will hold as long as the cache + blocksizes are each a multiple of MR and NR.) 
*/ \ + if ( diagoffb > 0 ) \ + { \ + j = diagoffb; \ + n = n - j; \ + diagoffb = 0; \ + c_cast = c_cast + (j )*cs_c; \ + } \ +\ + /* If there is a zero region below where the diagonal of B intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffb + n < k ) \ + { \ + k = -diagoffb + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. 
*/ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the triangular + part of C, and the rectangular portion. */ \ + dim_t n_iter_tri; \ + dim_t n_iter_rct; \ +\ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \ + { \ + /* If the entire panel of B does not intersect the diagonal, there is + no triangular region, and therefore we can skip the first set of + loops. */ \ + n_iter_tri = 0; \ + n_iter_rct = n_iter; \ + } \ + else \ + { \ + /* If the panel of B does intersect the diagonal, compute the number of + iterations in the triangular (or trapezoidal) region by dividing NR + into the number of rows in B. (There should never be any remainder + in this division.) The number of iterations in the rectangular region + is computed as the remaining number of iterations in the n dimension. */ \ + n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \ + n_iter_rct = n_iter - n_iter_tri; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + for the initial triangular region of B (if it exists). + NOTE: We don't need to call bli_thread_range_jrir*() here since we + employ a hack that calls for each thread to execute every iteration + of the jr and ir loops but skip all but the pointer increment for + iterations that are not assigned to it. */ \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter_tri; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ +\ + /* Determine the offset to and length of the panel that was packed + so we can index into the corresponding location in A. 
*/ \ + off_b0111 = 0; \ + k_b0111 = bli_min( k, -diagoffb_j + NR ); \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + { \ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_b_cur = k_b0111 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + if ( bli_trmm_my_iter( j, thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, caucus ) ) { \ +\ + ctype* restrict a1_i; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b0111, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. 
*/ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b0111, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += ps_b_cur; \ + } \ +\ + c1 += cstep_c; \ + } \ +\ + /* If there is no rectangular region, then we're done. */ \ + if ( n_iter_rct == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd and 1st + loops the remaining triangular region of B. */ \ + bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Advance the start and end iteration offsets for the rectangular region + by the number of iterations used for the triangular region. */ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ + jb0 = n_iter_tri; \ +\ + /* Save the resulting value of b1 from the previous loop since it represents + the starting point for the rectangular region. */ \ + b_cast = b1; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + /* NOTE: We must index through b_cast differently since it contains + the starting address of the rectangular region (which is already + n_iter_tri logical iterations through B). */ \ + b1 = b_cast + (j-jb0) * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. 
If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ +\ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2rr: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2rr: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2rr ) + diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c new file mode 100644 index 000000000..0fc2d514a --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c @@ -0,0 +1,618 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_trmm_ru_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + 
+ void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b0111; \ + dim_t off_b0111; \ + dim_t i, j, jb0; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely below its diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full. For all trmm, k_full is simply k. 
This is + needed because some parameter combinations of trmm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = k; \ +\ + /* Compute indexing scaling factor for for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of B + intersects the top edge of the panel, adjust the pointer to C and + treat this case as if the diagonal offset were zero. This skips over + the region that was not packed. (Note we assume the diagonal offset + is a multiple of MR; this assumption will hold as long as the cache + blocksizes are each a multiple of MR and NR.) 
*/ \ + if ( diagoffb > 0 ) \ + { \ + j = diagoffb; \ + n = n - j; \ + diagoffb = 0; \ + c_cast = c_cast + (j )*cs_c; \ + } \ +\ + /* If there is a zero region below where the diagonal of B intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffb + n < k ) \ + { \ + k = -diagoffb + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. 
*/ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the triangular + part of C, and the rectangular portion. */ \ + dim_t n_iter_tri; \ + dim_t n_iter_rct; \ +\ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \ + { \ + /* If the entire panel of B does not intersect the diagonal, there is + no triangular region, and therefore we can skip the first set of + loops. */ \ + n_iter_tri = 0; \ + n_iter_rct = n_iter; \ + } \ + else \ + { \ + /* If the panel of B does intersect the diagonal, compute the number of + iterations in the triangular (or trapezoidal) region by dividing NR + into the number of rows in B. (There should never be any remainder + in this division.) The number of iterations in the rectangular region + is computed as the remaining number of iterations in the n dimension. */ \ + n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \ + n_iter_rct = n_iter - n_iter_tri; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + for the initial triangular region of B (if it exists). + NOTE: We don't need to call bli_thread_range_jrir*() here since we + employ a hack that calls for each thread to execute every iteration + of the jr and ir loops but skip all but the pointer increment for + iterations that are not assigned to it. */ \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter_tri; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ +\ + /* Determine the offset to and length of the panel that was packed + so we can index into the corresponding location in A. 
*/ \ + off_b0111 = 0; \ + k_b0111 = bli_min( k, -diagoffb_j + NR ); \ +\ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + { \ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_b_cur = k_b0111 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + if ( bli_trmm_my_iter( j, thread ) ) { \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if ( bli_trmm_my_iter( i, caucus ) ) { \ +\ + ctype* restrict a1_i; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b0111, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. 
*/ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k_b0111, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += ps_b_cur; \ + } \ +\ + c1 += cstep_c; \ + } \ +\ + /* If there is no rectangular region, then we're done. */ \ + if ( n_iter_rct == 0 ) return; \ +\ + /* Use slab assignment of micropanels to threads in the 2nd and 1st + loops the remaining triangular region of B. */ \ + bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Advance the start and end iteration offsets for the rectangular region + by the number of iterations used for the triangular region. */ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ + jb0 = n_iter_tri; \ +\ + /* Save the resulting value of b1 from the previous loop since it represents + the starting point for the rectangular region. */ \ + b_cast = b1; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + /* NOTE: We must index through b_cast differently since it contains + the starting address of the rectangular region (which is already + n_iter_tri logical iterations through B). */ \ + b1 = b_cast + (j-jb0) * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, scale C + by beta. 
If it is strictly below the diagonal, scale by one. + This allows the current macro-kernel to work for both trmm + and trmm3. */ \ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ + } \ +\ +\ +\ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2sl: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2sl: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2sl ) + diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 8b666b3f4..783572944 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -60,7 +61,7 @@ void bli_trsm_blk_var1 bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Determine the current thread's subpartition range. - bli_thread_get_range_mdim + bli_thread_range_mdim ( direct, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index 6be5965a3..7286ba7e0 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -60,7 +61,7 @@ void bli_trsm_blk_var2 bli_l3_prune_unref_mparts_n( a, b, c, cntl ); // Determine the current thread's subpartition range. 
- bli_thread_get_range_ndim + bli_thread_range_ndim ( direct, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index ee40189e5..24f8f37bf 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -53,7 +54,16 @@ cntl_t* bli_trsm_l_cntl_create pack_t schema_b ) { - void* macro_kernel_p = bli_trsm_xx_ker_var2; + void* macro_kernel_p; + void* packa_fp; + void* packb_fp; + + // Use the function pointer to the macrokernels that use slab + // assignment of micropanels to threads in the jr and ir loops. + macro_kernel_p = bli_trsm_xx_ker_var2; + + packa_fp = bli_packm_blk_var1; + packb_fp = bli_packm_blk_var1; const opid_t family = BLIS_TRSM; @@ -78,7 +88,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( bli_trsm_packa, - bli_packm_blk_var1, + packa_fp, BLIS_MR, BLIS_MR, TRUE, // do NOT invert diagonal @@ -102,7 +112,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( bli_trsm_packb, - bli_packm_blk_var1, + packb_fp, BLIS_MR, BLIS_NR, FALSE, // do NOT invert diagonal @@ -140,8 +150,12 @@ cntl_t* bli_trsm_r_cntl_create pack_t schema_b ) { + // NOTE: trsm macrokernels are presently disabled for right-side execution. void* macro_kernel_p = bli_trsm_xx_ker_var2; + void* packa_fp = bli_packm_blk_var1; + void* packb_fp = bli_packm_blk_var1; + const opid_t family = BLIS_TRSM; // Create two nodes for the macro-kernel. 
@@ -165,7 +179,7 @@ cntl_t* bli_trsm_r_cntl_create cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( bli_trsm_packa, - bli_packm_blk_var1, + packa_fp, BLIS_NR, BLIS_MR, FALSE, // do NOT invert diagonal @@ -189,7 +203,7 @@ cntl_t* bli_trsm_r_cntl_create cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( bli_trsm_packb, - bli_packm_blk_var1, + packb_fp, BLIS_MR, BLIS_MR, TRUE, // do NOT invert diagonal diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index 021f8baf2..c561de93d 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -342,25 +343,42 @@ void PASTEMAC(ch,varname) \ /* Save the desired output datatype (indicating no typecasting). */ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* We don't bother querying the thrinfo_t node for the 1st loop because + we can't parallelize that loop in trsm due to the inter-iteration + dependencies that exist. */ \ + /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ +\ + dim_t jr_start, jr_end; \ + dim_t jr_inc; \ +\ + /* Determine the thread range and increment for the 2nd loop. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. + NOTE: Parallelism in the 1st loop is unattainable due to the + inter-iteration dependencies present in trsm. 
*/ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ - if( bli_trsm_my_iter( j, thread ) ) { \ -\ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1 + (0 )*rstep_c; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1 + (0 )*rstep_c; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ @@ -408,12 +426,11 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1 + ps_a_cur; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ - if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -473,12 +490,11 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ a2 = a1 + rstep_a; \ - if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ - if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -534,10 +550,6 @@ void PASTEMAC(ch,varname) \ \ c11 += rstep_c; \ } \ - } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ \ /* diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 0ddcd16d4..6db5c6569 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -350,25 +351,42 @@ void PASTEMAC(ch,varname) \ /* Save the desired output datatype (indicating no typecasting). */ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ - b1 = b_cast; \ - c1 = c_cast; \ + /* We don't bother querying the thrinfo_t node for the 1st loop because + we can't parallelize that loop in trsm due to the inter-iteration + dependencies that exist. */ \ + /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ +\ + dim_t jr_start, jr_end; \ + dim_t jr_inc; \ +\ + /* Determine the thread range and increment for the 2nd loop. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. + NOTE: Parallelism in the 1st loop is unattainable due to the + inter-iteration dependencies present in trsm. 
*/ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ - if( bli_trsm_my_iter( j, thread ) ) { \ -\ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1 + (m_iter-1)*rstep_c; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1 + (m_iter-1)*rstep_c; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( ib = 0; ib < m_iter; ++ib ) \ @@ -418,12 +436,11 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1 + ps_a_cur; \ - if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ - if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -483,12 +500,11 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ a2 = a1 + rstep_a; \ - if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ - /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ - if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ @@ -544,10 +560,6 @@ void PASTEMAC(ch,varname) \ \ c11 -= rstep_c; \ } \ - } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ \ /* diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 1cf456678..f69f5471d 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -430,7 +431,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if( bli_trsm_my_iter( i, thread ) ){ \ + if ( bli_trsm_my_iter_rr( i, thread ) ){ \ \ ctype* restrict a11; \ ctype* restrict a12; \ @@ -444,12 +445,12 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ if ( i + bli_thread_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + ps_b_cur; \ - if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \ b2 = b_cast; \ } \ \ @@ -516,7 +517,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). 
*/ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if( bli_trsm_my_iter( i, thread ) ){ \ + if ( bli_trsm_my_iter_rr( i, thread ) ){ \ \ ctype* restrict a2; \ \ @@ -524,12 +525,12 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ if ( i + bli_thread_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + cstep_b; \ - if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \ b2 = b_cast; \ } \ \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index b5a76d03a..2f3071d61 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -423,7 +424,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if( bli_trsm_my_iter( i, thread ) ){ \ + if ( bli_trsm_my_iter_rr( i, thread ) ){ \ \ ctype* restrict a10; \ ctype* restrict a11; \ @@ -437,12 +438,12 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ if ( i + bli_thread_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + ps_b_cur; \ - if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \ b2 = b_cast; \ } \ \ @@ -509,7 +510,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). 
*/ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if( bli_trsm_my_iter( i, thread ) ){ \ + if ( bli_trsm_my_iter_rr( i, thread ) ){ \ \ ctype* restrict a2; \ \ @@ -517,12 +518,12 @@ void PASTEMAC(ch,varname) \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ - /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ if ( i + bli_thread_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + cstep_b; \ - if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \ b2 = b_cast; \ } \ \ diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h index 5ac72c28c..ebd7afc2a 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -58,6 +59,7 @@ GENPROT( trsm_packa ) GENPROT( trsm_packb ) GENPROT( trsm_xx_ker_var2 ) + GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c index 24d55af24..c8527f647 100644 --- a/frame/3/trsm/bli_trsm_xx_ker_var2.c +++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c new file mode 100644 index 000000000..4e7e1b850 --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c @@ -0,0 +1,593 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); + + +void bli_trsm_ll_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + 
// attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. 
*/ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1011; \ + dim_t k_a10; \ + dim_t off_a10; \ + dim_t off_a11; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. 
*/ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If matrix A is above the diagonal, it is implicitly zero. + So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of MR. This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). 
*/ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of A intersects the + left edge of the block, adjust the pointer to C and treat this case as + if the diagonal offset were zero. This skips over the region that was + not packed. (Note we assume the diagonal offset is a multiple of MR; + this assumption will hold as long as the cache blocksizes are each a + multiple of MR and NR.) */ \ + if ( diagoffa < 0 ) \ + { \ + i = -diagoffa; \ + m = m - i; \ + diagoffa = 0; \ + c_cast = c_cast + (i )*rs_c; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of MR. If k + isn't a multiple of MR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an MR x MR triangular solve. + This adjustment of k is consistent with what happened when A was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of B. */ \ + if ( k % MR != 0 ) k += MR - ( k % MR ); \ +\ + /* NOTE: We don't need to check that m is a multiple of PACKMR since we + know that the underlying buffer was already allocated to have an m + dimension that is a multiple of PACKMR, with the region between the + last row and the next multiple of MR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. 
*/ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + if( bli_trsm_my_iter( j, thread ) ) { \ +\ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + a1 = a_cast; \ + c11 = c1 + (0 )*rstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of A resides below the diagonal, use a + a regular gemm micro-kernel. Otherwise, if it is above the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. 
*/ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a10; \ + ctype* restrict a11; \ + ctype* restrict b01; \ + ctype* restrict b11; \ + ctype* restrict a2; \ +\ + /* Compute various offsets into and lengths of parts of A. */ \ + off_a10 = 0; \ + k_a1011 = diagoffa_i + MR; \ + k_a10 = k_a1011 - MR; \ + off_a11 = k_a10; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1011 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* Compute the addresses of the panel A10 and the triangular + block A11. */ \ + a10 = a1; \ + /* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \ + a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \ +\ + /* Compute the addresses of the panel B01 and the block + B11. */ \ + b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \ + b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + ps_a_cur; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ + if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a10, \ + alpha1_cast, \ + a10, \ + a11, \ + b01, \ + b11, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. 
*/ \ + gemmtrsm_ukr \ + ( \ + k_a10, \ + alpha1_cast, \ + a10, \ + a11, \ + b01, \ + b11, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ + if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + alpha2_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + c1 += cstep_c; \ + } \ +\ +/* +if ( bli_is_4mi_packed( schema_a ) ){ \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \ + ( double* )b, rs_b, 1, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \ + ( double* )b+72, rs_b, 1, "%4.1f", "" ); \ +}else{ \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \ + ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \ + ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \ +} \ +*/ \ +\ +/* +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \ + ( double* )a11, 1, PACKMR, "%4.1f", "" ); \ +*/ \ +\ +/* +if ( bli_is_4mi_packed( schema_a ) ){ \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \ + ( double* )b, rs_b, 1, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \ + ( double* )b+72, rs_b, 1, "%4.1f", "" ); \ +}else{ \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \ + ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \ + ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \ +} \ + +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \ + ( double* )c, 1, cs_c, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \ + ( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, 
"trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 ) + diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c new file mode 100644 index 000000000..844d76ab7 --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c @@ -0,0 +1,605 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_trsm_ll_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* 
buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + // attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. 
*/ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1011; \ + dim_t k_a10; \ + dim_t off_a10; \ + dim_t off_a11; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. 
*/ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If matrix A is above the diagonal, it is implicitly zero. + So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of MR. This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). 
*/ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of A intersects the + left edge of the block, adjust the pointer to C and treat this case as + if the diagonal offset were zero. This skips over the region that was + not packed. (Note we assume the diagonal offset is a multiple of MR; + this assumption will hold as long as the cache blocksizes are each a + multiple of MR and NR.) */ \ + if ( diagoffa < 0 ) \ + { \ + i = -diagoffa; \ + m = m - i; \ + diagoffa = 0; \ + c_cast = c_cast + (i )*rs_c; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of MR. If k + isn't a multiple of MR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an MR x MR triangular solve. + This adjustment of k is consistent with what happened when A was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of B. */ \ + if ( k % MR != 0 ) k += MR - ( k % MR ); \ +\ + /* NOTE: We don't need to check that m is a multiple of PACKMR since we + know that the underlying buffer was already allocated to have an m + dimension that is a multiple of PACKMR, with the region between the + last row and the next multiple of MR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. 
*/ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* We don't bother querying the thrinfo_t node for the 1st loop because + we can't parallelize that loop in trsm due to the inter-iteration + dependencies that exist. */ \ + /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ +\ + dim_t jr_start, jr_end; \ + dim_t jr_inc; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop. + NOTE: Parallelism in the 1st loop is unattainable due to the + inter-iteration dependencies present in trsm. */ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. 
*/ \ + b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1 + (0 )*rstep_c; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of A resides below the diagonal, use a + a regular gemm micro-kernel. Otherwise, if it is above the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a10; \ + ctype* restrict a11; \ + ctype* restrict b01; \ + ctype* restrict b11; \ + ctype* restrict a2; \ +\ + /* Compute various offsets into and lengths of parts of A. */ \ + off_a10 = 0; \ + k_a1011 = diagoffa_i + MR; \ + k_a10 = k_a1011 - MR; \ + off_a11 = k_a10; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1011 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* Compute the addresses of the panel A10 and the triangular + block A11. */ \ + a10 = a1; \ + /* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \ + a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \ +\ + /* Compute the addresses of the panel B01 and the block + B11. */ \ + b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \ + b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + ps_a_cur; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a10, \ + alpha1_cast, \ + a10, \ + a11, \ + b01, \ + b11, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a10, \ + alpha1_cast, \ + a10, \ + a11, \ + b01, \ + b11, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + alpha2_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ +/* +if ( bli_is_4mi_packed( schema_a ) ){ \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \ + ( double* )b, rs_b, 1, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \ + ( double* )b+72, rs_b, 1, "%4.1f", "" ); \ +}else{ \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \ + ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \ + ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \ +} \ +*/ \ +\ +/* +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \ + ( double* )a11, 1, PACKMR, "%4.1f", "" ); \ +*/ \ +\ +/* +if ( bli_is_4mi_packed( schema_a ) ){ \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \ + ( double* )b, rs_b, 1, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \ + ( double* )b+72, rs_b, 1, "%4.1f", "" ); \ +}else{ \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \ + ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \ + ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \ +} \ + +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \ + ( double* )c, 1, cs_c, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \ + ( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, 
NR, 1, "%5.2f", "" ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2rr ) + diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c new file mode 100644 index 000000000..e67de28fe --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c @@ -0,0 +1,605 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_trsm_ll_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* 
buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + // attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. 
*/ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1011; \ + dim_t k_a10; \ + dim_t off_a10; \ + dim_t off_a11; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. 
*/ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If matrix A is above the diagonal, it is implicitly zero. + So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of MR. This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ +\ + /* Compute indexing scaling factor for for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). 
*/ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of A intersects the + left edge of the block, adjust the pointer to C and treat this case as + if the diagonal offset were zero. This skips over the region that was + not packed. (Note we assume the diagonal offset is a multiple of MR; + this assumption will hold as long as the cache blocksizes are each a + multiple of MR and NR.) */ \ + if ( diagoffa < 0 ) \ + { \ + i = -diagoffa; \ + m = m - i; \ + diagoffa = 0; \ + c_cast = c_cast + (i )*rs_c; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of MR. If k + isn't a multiple of MR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an MR x MR triangular solve. + This adjustment of k is consistent with what happened when A was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of B. */ \ + if ( k % MR != 0 ) k += MR - ( k % MR ); \ +\ + /* NOTE: We don't need to check that m is a multiple of PACKMR since we + know that the underlying buffer was already allocated to have an m + dimension that is a multiple of PACKMR, with the region between the + last row and the next multiple of MR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. 
*/ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* We don't bother querying the thrinfo_t node for the 1st loop because + we can't parallelize that loop in trsm due to the inter-iteration + dependencies that exist. */ \ + /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ +\ + dim_t jr_start, jr_end; \ + dim_t jr_inc; \ +\ + /* Use slab assignment of micropanels to threads in the 2nd loop. + NOTE: Parallelism in the 1st loop is unattainable due to the + inter-iteration dependencies present in trsm. */ \ + bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. 
*/ \ + b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1 + (0 )*rstep_c; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of A resides below the diagonal, use a + a regular gemm micro-kernel. Otherwise, if it is above the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a10; \ + ctype* restrict a11; \ + ctype* restrict b01; \ + ctype* restrict b11; \ + ctype* restrict a2; \ +\ + /* Compute various offsets into and lengths of parts of A. */ \ + off_a10 = 0; \ + k_a1011 = diagoffa_i + MR; \ + k_a10 = k_a1011 - MR; \ + off_a11 = k_a10; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1011 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* Compute the addresses of the panel A10 and the triangular + block A11. */ \ + a10 = a1; \ + /* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \ + a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \ +\ + /* Compute the addresses of the panel B01 and the block + B11. */ \ + b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \ + b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + ps_a_cur; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a10, \ + alpha1_cast, \ + a10, \ + a11, \ + b01, \ + b11, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a10, \ + alpha1_cast, \ + a10, \ + a11, \ + b01, \ + b11, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + alpha2_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 += rstep_c; \ + } \ + } \ +\ +/* +if ( bli_is_4mi_packed( schema_a ) ){ \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \ + ( double* )b, rs_b, 1, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \ + ( double* )b+72, rs_b, 1, "%4.1f", "" ); \ +}else{ \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \ + ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \ + ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \ +} \ +*/ \ +\ +/* +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \ + ( double* )a11, 1, PACKMR, "%4.1f", "" ); \ +*/ \ +\ +/* +if ( bli_is_4mi_packed( schema_a ) ){ \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \ + ( double* )b, rs_b, 1, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \ + ( double* )b+72, rs_b, 1, "%4.1f", "" ); \ +}else{ \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \ + ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \ + ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \ +} \ + +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \ + ( double* )c, 1, cs_c, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \ + ( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, 
NR, 1, "%5.2f", "" ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2sl ) + diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c new file mode 100644 index 000000000..a8978df86 --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c @@ -0,0 +1,574 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); + + +void bli_trsm_lu_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + 
// attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. 
*/ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1112; \ + dim_t k_a11; \ + dim_t k_a12; \ + dim_t off_a11; \ + dim_t off_a12; \ + dim_t i, j, ib; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. 
*/ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If matrix A is below the diagonal, it is implicitly zero. + So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of MR. This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ +\ + /* Compute indexing scaling factor for for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). 
*/ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of A + intersects the top edge of the block, adjust the pointer to B and + treat this case as if the diagonal offset were zero. Note that we + don't need to adjust the pointer to A since packm would have simply + skipped over the region that was not stored. */ \ + if ( diagoffa > 0 ) \ + { \ + i = diagoffa; \ + k = k - i; \ + diagoffa = 0; \ + b_cast = b_cast + ( i * PACKNR ) / off_scl; \ + } \ +\ + /* If there is a zero region below where the diagonal of A intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffa + k < m ) \ + { \ + m = -diagoffa + k; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of MR. If k + isn't a multiple of MR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an MR x MR triangular solve. + This adjustment of k is consistent with what happened when A was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of B. */ \ + if ( k % MR != 0 ) k += MR - ( k % MR ); \ +\ + /* NOTE: We don't need to check that m is a multiple of PACKMR since we + know that the underlying buffer was already allocated to have an m + dimension that is a multiple of PACKMR, with the region between the + last row and the next multiple of MR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. 
*/ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + if( bli_trsm_my_iter( j, thread ) ) { \ +\ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + a1 = a_cast; \ + c11 = c1 + (m_iter-1)*rstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( ib = 0; ib < m_iter; ++ib ) \ + { \ + i = m_iter - 1 - ib; \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of A resides above the diagonal, use a + a regular gemm micro-kernel. Otherwise, if it is below the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. 
*/ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a11; \ + ctype* restrict a12; \ + ctype* restrict b11; \ + ctype* restrict b21; \ + ctype* restrict a2; \ +\ + /* Compute various offsets into and lengths of parts of A. */ \ + off_a11 = diagoffa_i; \ + k_a1112 = k - off_a11;; \ + k_a11 = MR; \ + k_a12 = k_a1112 - MR; \ + off_a12 = off_a11 + k_a11; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1112 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* Compute the addresses of the triangular block A11 and the + panel A12. */ \ + a11 = a1; \ + /* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \ + a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \ +\ + /* Compute the addresses of the panel B01 and the block + B11. */ \ + b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \ + b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + ps_a_cur; \ + if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ + if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a12, \ + alpha1_cast, \ + a12, \ + a11, \ + b21, \ + b11, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. 
*/ \ + gemmtrsm_ukr \ + ( \ + k_a12, \ + alpha1_cast, \ + a12, \ + a11, \ + b21, \ + b11, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ + if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + alpha2_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 -= rstep_c; \ + } \ + } \ +\ + b1 += cstep_b; \ + c1 += cstep_c; \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \ +printf( "m_iter = %lu\n", m_iter ); \ +printf( "m_cur = %lu\n", m_cur ); \ +printf( "k = %lu\n", k ); \ +printf( "diagoffa_i = %lu\n", diagoffa_i ); \ +printf( "off_a1112 = %lu\n", off_a1112 ); \ +printf( "k_a1112 = %lu\n", k_a1112 ); \ +printf( "k_a12 = %lu\n", k_a12 ); \ +printf( "k_a11 = %lu\n", k_a11 ); \ +printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \ +printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 ) + diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c new file mode 100644 index 000000000..3d2792508 --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c @@ -0,0 +1,586 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2rr); + +// +// -- Macrokernel functions for round-robin partitioning ----------------------- +// + +void bli_trsm_lu_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + // attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. 
+ buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. */ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1112; \ + dim_t k_a11; \ + dim_t k_a12; \ + dim_t off_a11; \ + dim_t off_a12; \ + dim_t i, j, ib; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If matrix A is below the diagonal, it is implicitly zero. + So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of MR. 
This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ +\ + /* Compute indexing scaling factor for for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of A + intersects the top edge of the block, adjust the pointer to B and + treat this case as if the diagonal offset were zero. Note that we + don't need to adjust the pointer to A since packm would have simply + skipped over the region that was not stored. 
*/ \ + if ( diagoffa > 0 ) \ + { \ + i = diagoffa; \ + k = k - i; \ + diagoffa = 0; \ + b_cast = b_cast + ( i * PACKNR ) / off_scl; \ + } \ +\ + /* If there is a zero region below where the diagonal of A intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffa + k < m ) \ + { \ + m = -diagoffa + k; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of MR. If k + isn't a multiple of MR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an MR x MR triangular solve. + This adjustment of k is consistent with what happened when A was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of B. */ \ + if ( k % MR != 0 ) k += MR - ( k % MR ); \ +\ + /* NOTE: We don't need to check that m is a multiple of PACKMR since we + know that the underlying buffer was already allocated to have an m + dimension that is a multiple of PACKMR, with the region between the + last row and the next multiple of MR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. 
*/ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* We don't bother querying the thrinfo_t node for the 1st loop because + we can't parallelize that loop in trsm due to the inter-iteration + dependencies that exist. */ \ + /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ +\ + dim_t jr_start, jr_end; \ + dim_t jr_inc; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop. + NOTE: Parallelism in the 1st loop is unattainable due to the + inter-iteration dependencies present in trsm. */ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1 + (m_iter-1)*rstep_c; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( ib = 0; ib < m_iter; ++ib ) \ + { \ + i = m_iter - 1 - ib; \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? 
MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of A resides above the diagonal, use a + a regular gemm micro-kernel. Otherwise, if it is below the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a11; \ + ctype* restrict a12; \ + ctype* restrict b11; \ + ctype* restrict b21; \ + ctype* restrict a2; \ +\ + /* Compute various offsets into and lengths of parts of A. */ \ + off_a11 = diagoffa_i; \ + k_a1112 = k - off_a11;; \ + k_a11 = MR; \ + k_a12 = k_a1112 - MR; \ + off_a12 = off_a11 + k_a11; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1112 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* Compute the addresses of the triangular block A11 and the + panel A12. */ \ + a11 = a1; \ + /* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \ + a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \ +\ + /* Compute the addresses of the panel B01 and the block + B11. */ \ + b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \ + b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + ps_a_cur; \ + if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately. 
*/ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a12, \ + alpha1_cast, \ + a12, \ + a11, \ + b21, \ + b11, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a12, \ + alpha1_cast, \ + a12, \ + a11, \ + b21, \ + b11, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + alpha2_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 -= rstep_c; \ + } \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \ +printf( "m_iter = %lu\n", m_iter ); \ +printf( "m_cur = %lu\n", m_cur ); \ +printf( "k = %lu\n", k ); \ +printf( "diagoffa_i = %lu\n", diagoffa_i ); \ +printf( "off_a1112 = %lu\n", off_a1112 ); \ +printf( "k_a1112 = %lu\n", k_a1112 ); \ +printf( "k_a12 = %lu\n", k_a12 ); \ +printf( "k_a11 = %lu\n", k_a11 ); \ +printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \ +printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2rr ) + diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c new file mode 100644 index 000000000..486294352 --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c @@ -0,0 +1,586 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffa, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2sl); + +// +// -- Macrokernel functions for slab partitioning ------------------------------ +// + +void bli_trsm_lu_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + // attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. 
+ buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffa, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffa, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. */ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffa_i; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_a1112; \ + dim_t k_a11; \ + dim_t k_a12; \ + dim_t off_a11; \ + dim_t off_a12; \ + dim_t i, j, ib; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_a_num; \ + inc_t ss_a_den; \ + inc_t ps_a_cur; \ + inc_t is_a_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If matrix A is below the diagonal, it is implicitly zero. + So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of MR. 
This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_a ) || \ + bli_is_3mi_packed( schema_a ) || \ + bli_is_rih_packed( schema_a ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of A + intersects the top edge of the block, adjust the pointer to B and + treat this case as if the diagonal offset were zero. Note that we + don't need to adjust the pointer to A since packm would have simply + skipped over the region that was not stored.
*/ \ + if ( diagoffa > 0 ) \ + { \ + i = diagoffa; \ + k = k - i; \ + diagoffa = 0; \ + b_cast = b_cast + ( i * PACKNR ) / off_scl; \ + } \ +\ + /* If there is a zero region below where the diagonal of A intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffa + k < m ) \ + { \ + m = -diagoffa + k; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of MR. If k + isn't a multiple of MR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an MR x MR triangular solve. + This adjustment of k is consistent with what happened when A was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of B. */ \ + if ( k % MR != 0 ) k += MR - ( k % MR ); \ +\ + /* NOTE: We don't need to check that m is a multiple of PACKMR since we + know that the underlying buffer was already allocated to have an m + dimension that is a multiple of PACKMR, with the region between the + last row and the next multiple of MR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. 
*/ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k; \ + istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* We don't bother querying the thrinfo_t node for the 1st loop because + we can't parallelize that loop in trsm due to the inter-iteration + dependencies that exist. */ \ + /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ +\ + dim_t jr_start, jr_end; \ + dim_t jr_inc; \ +\ + /* Use slab assignment of micropanels to threads in the 2nd loop. + NOTE: Parallelism in the 1st loop is unattainable due to the + inter-iteration dependencies present in trsm. */ \ + bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + a1 = a_cast; \ + c11 = c1 + (m_iter-1)*rstep_c; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( ib = 0; ib < m_iter; ++ib ) \ + { \ + i = m_iter - 1 - ib; \ + diagoffa_i = diagoffa + ( doff_t )i*MR; \ +\ + m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? 
MR : m_left ); \ +\ + /* If the current panel of A intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of A resides above the diagonal, use + a regular gemm micro-kernel. Otherwise, if it is below the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. */ \ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a11; \ + ctype* restrict a12; \ + ctype* restrict b11; \ + ctype* restrict b21; \ + ctype* restrict a2; \ +\ + /* Compute various offsets into and lengths of parts of A. */ \ + off_a11 = diagoffa_i; \ + k_a1112 = k - off_a11;; \ + k_a11 = MR; \ + k_a12 = k_a1112 - MR; \ + off_a12 = off_a11 + k_a11; \ +\ + /* Compute the panel stride for the current diagonal- + intersecting micro-panel. */ \ + is_a_cur = k_a1112 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ +\ + /* Compute the addresses of the triangular block A11 and the + panel A12. */ \ + a11 = a1; \ + /* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \ + a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \ +\ + /* Compute the addresses of the block B11 and the + panel B21. */ \ + b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \ + b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + ps_a_cur; \ + if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( is_a_cur, &aux ); \ +\ + /* Handle interior and edge cases separately.
*/ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a12, \ + alpha1_cast, \ + a12, \ + a11, \ + b21, \ + b11, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_a12, \ + alpha1_cast, \ + a12, \ + a11, \ + b21, \ + b11, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += ps_a_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ + { \ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t + object. */ \ + bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + alpha2_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ +\ + a1 += rstep_a; \ + } \ +\ + c11 -= rstep_c; \ + } \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \ +printf( "m_iter = %lu\n", m_iter ); \ +printf( "m_cur = %lu\n", m_cur ); \ +printf( "k = %lu\n", k ); \ +printf( "diagoffa_i = %lu\n", diagoffa_i ); \ +printf( "off_a1112 = %lu\n", off_a1112 ); \ +printf( "k_a1112 = %lu\n", k_a1112 ); \ +printf( "k_a12 = %lu\n", k_a12 ); \ +printf( "k_a11 = %lu\n", k_a11 ); \ +printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \ +printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \ +*/ \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2sl ) + diff --git a/frame/3/trsm/other/bli_trsm_rl_ker_var2.c b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c new file mode 100644 index 000000000..70b3e456d --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c @@ -0,0 +1,591 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); + + +void bli_trsm_rl_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + // attached to A (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of A prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( a ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. 
This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. */ \ + /* NOTE: We use the upper-triangular gemmtrsm ukernel because, while + the current macro-kernel targets the "rl" case (right-side/lower- + triangular), it becomes upper-triangular after the kernel operation + is transposed so that all kernel instances are of the "left" + variety (since those are the only trsm ukernels that exist). */ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. 
Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b1121; \ + dim_t k_b11; \ + dim_t k_b21; \ + dim_t off_b11; \ + dim_t off_b21; \ + dim_t i, j, jb; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKNR + pd_a == NR + ps_a == stride to next micro-panel of A + rs_b == PACKMR + cs_b == 1 + pd_b == MR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + + Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the + swapping of values in the control tree (ie: those values used when + packing). This swapping is needed since we cast right-hand trsm in + terms of transposed left-hand trsm. So, if we're going to be + transposing the operation, then A needs to be packed with NR and B + needs to be packed with MR (remember: B is the triangular matrix in + the right-hand side parameter case). 
+ */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely above its diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of NR. This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of A (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2.
Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region above where the diagonal of B intersects + the left edge of the panel, adjust the pointer to A and treat this + case as if the diagonal offset were zero. Note that we don't need to + adjust the pointer to B since packm would have simply skipped over + the region that was not stored. */ \ + if ( diagoffb < 0 ) \ + { \ + j = -diagoffb; \ + k = k - j; \ + diagoffb = 0; \ + a_cast = a_cast + ( j * PACKMR ) / off_scl; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of B intersects the bottom of the panel, shrink it so that + we can index to the correct place in C (corresponding to the + part of the panel of B that was packed). + NOTE: This is NOT being done to skip over "no-op" iterations, + as with the trsm_lu macro-kernel. This MUST be done for correct + execution because we use n (via n_iter) to compute diagonal and + index offsets for backwards movement through B. */ \ + if ( diagoffb + k < n ) \ + { \ + n = diagoffb + k; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of NR. If k + isn't a multiple of NR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an NR x NR triangular solve. + This adjustment of k is consistent with what happened when B was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of A. 
*/ \ + if ( k % NR != 0 ) k += NR - ( k % NR ); \ +\ + /* NOTE: We don't need to check that n is a multiple of PACKNR since we + know that the underlying buffer was already allocated to have an n + dimension that is a multiple of PACKNR, with the region between the + last column and the next multiple of NR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_schema_a( schema_b, &aux ); \ + bli_auxinfo_set_schema_b( schema_a, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_b( istep_a, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( jb = 0; jb < n_iter; ++jb ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b11; \ + ctype* restrict b21; \ + ctype* restrict b2; \ +\ + j = n_iter - 1 - jb; \ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ + a1 = a_cast; \ + c11 = c1 + (n_iter-1)*cstep_c; \ +\ + n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? 
NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of B resides below the diagonal, use + a regular gemm micro-kernel. Otherwise, if it is above the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. */ \ + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Determine the offset to and length of the panel that was packed + so we can index into the corresponding location in A. */ \ + off_b11 = bli_max( -diagoffb_j, 0 ); \ + k_b1121 = k - off_b11; \ + k_b11 = NR; \ + k_b21 = k_b1121 - NR; \ + off_b21 = off_b11 + k_b11; \ +\ + /* Compute the addresses of the triangular block B11 and the + panel B21. */ \ + b11 = b1; \ + /* b21 = b1 + ( k_b11 * PACKNR ) / off_scl; */ \ + b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, off_scl ); \ +\ + /* Compute the panel stride for the current micro-panel. */ \ + is_b_cur = k_b1121 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_a( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if( bli_trsm_my_iter( i, thread ) ){ \ +\ + ctype* restrict a11; \ + ctype* restrict a12; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the A11 block and A12 panel. */ \ + a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \ + a12 = a1 + ( off_b21 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B.
*/ \ + a2 = a1; \ + /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + if ( i + bli_thread_num_threads(thread) >= m_iter ) \ + { \ + a2 = a_cast; \ + b2 = b1 + ps_b_cur; \ + if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. NOTE: We swap the values for A and B since the + triangular "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_next_a( b2, &aux ); \ + bli_auxinfo_set_next_b( a2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_b21, \ + alpha1_cast, \ + b21, \ + b11, \ + a12, \ + a11, \ + c11, cs_c, rs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_b21, \ + alpha1_cast, \ + b21, \ + b11, \ + a12, \ + a11, \ + ct, cs_ct, rs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ +\ + b1 += ps_b_cur; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_a( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if( bli_trsm_my_iter( i, thread ) ){ \ +\ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1; \ + /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + if ( i + bli_thread_num_threads(thread) >= m_iter ) \ + { \ + a2 = a_cast; \ + b2 = b1 + cstep_b; \ + if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. NOTE: We swap the values for A and B since the + triangular "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_next_a( b2, &aux ); \ + bli_auxinfo_set_next_b( a2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + b1, \ + a1, \ + alpha2_cast, \ + c11, cs_c, rs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + b1, \ + a1, \ + zero, \ + ct, cs_ct, rs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ +\ + b1 += cstep_b; \ + } \ +\ + c1 -= cstep_c; \ + } \ +} + +INSERT_GENTFUNC_BASIC0( trsm_rl_ker_var2 ) + diff --git a/frame/3/trsm/other/bli_trsm_ru_ker_var2.c b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c new file mode 100644 index 000000000..289bb5d9f --- /dev/null +++ b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c @@ -0,0 +1,584 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffb, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha1, + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, + void* alpha2, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); + + +void bli_trsm_ru_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_alpha1; + void* buf_alpha2; + + FUNCPTR_T f; + + // Grab the address of the internal scalar buffer for the scalar + // attached to A (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of A prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( a ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. 
This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( diagoffb, + schema_a, + schema_b, + m, + n, + k, + buf_alpha1, + buf_a, cs_a, pd_a, ps_a, + buf_b, rs_b, pd_b, ps_b, + buf_alpha2, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffb, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha1, \ + void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ + void* alpha2, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + const dim_t PACKMR = cs_a; \ + const dim_t PACKNR = rs_b; \ +\ + /* Cast the micro-kernel address to its function pointer type. */ \ + /* NOTE: We use the lower-triangular gemmtrsm ukernel because, while + the current macro-kernel targets the "ru" case (right-side/upper- + triangular), it becomes lower-triangular after the kernel operation + is transposed so that all kernel instances are of the "left" + variety (since those are the only trsm ukernels that exist). */ \ + PASTECH(ch,gemmtrsm_ukr_ft) \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. 
Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffb_j; \ + dim_t k_full; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t k_b0111; \ + dim_t k_b01; \ + dim_t off_b01; \ + dim_t off_b11; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t istep_a; \ + inc_t istep_b; \ + inc_t off_scl; \ + inc_t ss_b_num; \ + inc_t ss_b_den; \ + inc_t ps_b_cur; \ + inc_t is_b_cur; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKNR + pd_a == NR + ps_a == stride to next micro-panel of A + rs_b == PACKMR + cs_b == 1 + pd_b == MR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + + Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the + swapping of values in the control tree (ie: those values used when + packing). This swapping is needed since we cast right-hand trsm in + terms of transposed left-hand trsm. So, if we're going to be + transposing the operation, then A needs to be packed with NR and B + needs to be packed with MR (remember: B is the triangular matrix in + the right-hand side parameter case). 
+ */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of B is entirely below its diagonal, + it is implicitly zero. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute k_full as k inflated up to a multiple of NR. This is + needed because some parameter combinations of trsm reduce k + to advance past zero regions in the triangular matrix, and + when computing the imaginary stride of B (the non-triangular + matrix), which is used by 4m1/3m1 implementations, we need + this unreduced value of k. */ \ + k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \ +\ + /* Compute indexing scaling factor for 4m or 3m. This is + needed because one of the packing register blocksizes (PACKMR + or PACKNR) is used to index into the micro-panels of the non- + triangular matrix when computing with a diagonal-intersecting + micro-panel of the triangular matrix. In the case of 4m or 3m, + real values are stored in both sub-panels, and so the indexing + needs to occur in units of real values. The value computed + here is divided into the complex pointer offset to cause the + pointer to be advanced by the correct value. */ \ + if ( bli_is_4mi_packed( schema_b ) || \ + bli_is_3mi_packed( schema_b ) || \ + bli_is_rih_packed( schema_b ) ) off_scl = 2; \ + else off_scl = 1; \ +\ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2.
Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ +\ + /* If there is a zero region to the left of where the diagonal of B + intersects the top edge of the panel, adjust the pointer to C and + treat this case as if the diagonal offset were zero. This skips over + the region that was not packed. (Note we assume the diagonal offset + is a multiple of MR; this assumption will hold as long as the cache + blocksizes are each a multiple of MR and NR.) */ \ + if ( diagoffb > 0 ) \ + { \ + j = diagoffb; \ + n = n - j; \ + diagoffb = 0; \ + c_cast = c_cast + (j )*cs_c; \ + } \ +\ + /* If there is a zero region below where the diagonal of B intersects the + right side of the block, shrink it to prevent "no-op" iterations from + executing. */ \ + if ( -diagoffb + n < k ) \ + { \ + k = -diagoffb + n; \ + } \ +\ + /* Check the k dimension, which needs to be a multiple of NR. If k + isn't a multiple of NR, we adjust it higher to satisfy the micro- + kernel, which is expecting to perform an NR x NR triangular solve. + This adjustment of k is consistent with what happened when B was + packed: all of its bottom/right edges were zero-padded, and + furthermore, the panel that stores the bottom-right corner of the + matrix has its diagonal extended into the zero-padded region (as + identity). This allows the trsm of that bottom-right panel to + proceed without producing any infs or NaNs that would infect the + "good" values of the corresponding block of A. 
*/ \ + if ( k % NR != 0 ) k += NR - ( k % NR ); \ +\ + /* NOTE: We don't need to check that n is a multiple of PACKNR since we + know that the underlying buffer was already allocated to have an n + dimension that is a multiple of PACKNR, with the region between the + last column and the next multiple of NR zero-padded accordingly. */ \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + istep_a = PACKMR * k_full; \ + istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_schema_a( schema_b, &aux ); \ + bli_auxinfo_set_schema_b( schema_a, &aux ); \ +\ + /* Save the imaginary stride of A to the auxinfo_t object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_b( istep_a, &aux ); \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b01; \ + ctype* restrict b11; \ + ctype* restrict b2; \ +\ + diagoffb_j = diagoffb - ( doff_t )j*NR; \ + a1 = a_cast; \ + c11 = c1; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? 
NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* If the current panel of B intersects the diagonal, use a + special micro-kernel that performs a fused gemm and trsm. + If the current panel of B resides above the diagonal, use a + a regular gemm micro-kernel. Otherwise, if it is below the + diagonal, it was not packed (because it is implicitly zero) + and so we do nothing. */ \ + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Determine the offset to and length of the panel that was packed + so we can index into the corresponding location in A. */ \ + off_b01 = 0; \ + k_b0111 = bli_min( k, -diagoffb_j + NR ); \ + k_b01 = k_b0111 - NR; \ + off_b11 = k_b01; \ +\ + /* Compute the addresses of the panel B10 and the triangular + block B11. */ \ + b01 = b1; \ + /* b11 = b1 + ( k_b01 * PACKNR ) / off_scl; */ \ + b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, off_scl ); \ +\ + /* Compute the panel stride for the current micro-panel. */ \ + is_b_cur = k_b0111 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ +\ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_a( is_b_cur, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if( bli_trsm_my_iter( i, thread ) ){ \ +\ + ctype* restrict a10; \ + ctype* restrict a11; \ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the A10 panel and A11 block. */ \ + a10 = a1 + ( off_b01 * PACKMR ) / off_scl; \ + a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1; \ + /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + if ( i + bli_thread_num_threads(thread) >= m_iter ) \ + { \ + a2 = a_cast; \ + b2 = b1 + ps_b_cur; \ + if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. NOTE: We swap the values for A and B since the + triangular "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_next_a( b2, &aux ); \ + bli_auxinfo_set_next_b( a2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_b01, \ + alpha1_cast, \ + b01, \ + b11, \ + a10, \ + a11, \ + c11, cs_c, rs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the fused gemm/trsm micro-kernel. */ \ + gemmtrsm_ukr \ + ( \ + k_b01, \ + alpha1_cast, \ + b01, \ + b11, \ + a10, \ + a11, \ + ct, cs_ct, rs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Copy the result to the bottom edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ +\ + b1 += ps_b_cur; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \ + { \ + /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t + object. + NOTE: We swap the values for A and B since the triangular + "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_is_a( istep_b, &aux ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + if( bli_trsm_my_iter( i, thread ) ){ \ +\ + ctype* restrict a2; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a1; \ + /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ + if ( i + bli_thread_num_threads(thread) >= m_iter ) \ + { \ + a2 = a_cast; \ + b2 = b1 + cstep_b; \ + if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. NOTE: We swap the values for A and B since the + triangular "A" matrix is actually contained within B. */ \ + bli_auxinfo_set_next_a( b2, &aux ); \ + bli_auxinfo_set_next_b( a2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + b1, \ + a1, \ + alpha2_cast, \ + c11, cs_c, rs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + minus_one, \ + b1, \ + a1, \ + zero, \ + ct, cs_ct, rs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Add the result to the edge of C. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + alpha2_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ +\ + b1 += cstep_b; \ + } \ +\ + c1 += cstep_c; \ + } \ +} + +INSERT_GENTFUNC_BASIC0( trsm_ru_ker_var2 ) + diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index 344a07447..42ed83bc5 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -94,6 +95,60 @@ gint_t bli_info_get_enable_packbuf_pools( void ) return 0; #endif } +gint_t bli_info_get_enable_threading( void ) +{ + if ( bli_info_get_enable_openmp() || + bli_info_get_enable_pthreads() ) return 1; + else return 0; +} +gint_t bli_info_get_enable_openmp( void ) +{ +#ifdef BLIS_ENABLE_OPENMP + return 1; +#else + return 0; +#endif +} +gint_t bli_info_get_enable_pthreads( void ) +{ +#ifdef BLIS_ENABLE_PTHREADS + return 1; +#else + return 0; +#endif +} +gint_t bli_info_get_thread_part_jrir_slab( void ) +{ +#ifdef BLIS_ENABLE_JRIR_SLAB + return 1; +#else + return 0; +#endif +} +gint_t bli_info_get_thread_part_jrir_rr( void ) +{ +#ifdef BLIS_ENABLE_JRIR_RR + return 1; +#else + return 0; +#endif +} +gint_t bli_info_get_enable_memkind( void ) +{ +#ifdef BLIS_ENABLE_MEMKIND + return 1; +#else + return 0; +#endif +} +gint_t bli_info_get_enable_sandbox( void ) +{ +#ifdef BLIS_ENABLE_SANDBOX + return 1; +#else + return 0; +#endif +} diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index 82ff86b03..96aeade85 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -58,6 +59,13 @@ gint_t bli_info_get_enable_blas( void ); gint_t bli_info_get_enable_cblas( void ); gint_t bli_info_get_blas_int_type_size( void ); gint_t bli_info_get_enable_packbuf_pools( void ); +gint_t bli_info_get_enable_threading( void ); +gint_t bli_info_get_enable_openmp( void ); +gint_t bli_info_get_enable_pthreads( void ); +gint_t bli_info_get_thread_part_jrir_slab( void ); +gint_t bli_info_get_thread_part_jrir_rr( void ); +gint_t bli_info_get_enable_memkind( void ); +gint_t bli_info_get_enable_sandbox( void ); // -- Kernel implementation-related -------------------------------------------- diff --git a/frame/base/bli_prune.c b/frame/base/bli_prune.c index 9b5803d9f..1f40933b0 100644 --- a/frame/base/bli_prune.c +++ b/frame/base/bli_prune.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -45,7 +46,7 @@ void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, // matrix is empty. This is not strictly needed but rather a minor // optimization, as it would prevent threads that would otherwise get // subproblems on BLIS_ZEROS operands from calling the macro-kernel, - // because bli_thread_get_range*() would return empty ranges, which would + // because bli_thread_range*() would return empty ranges, which would // cause the variant's for loop from executing any iterations. 
// NOTE: this should only ever execute if the primary object is // triangular because that is the only structure type with subpartitions diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index eb92f08b0..4d235700f 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -638,6 +639,13 @@ static bool_t bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } +static bool_t bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) +{ + return ( bool_t ) + ( bli_is_strictly_above_diag_n( diagoff, m, n ) || + bli_is_strictly_below_diag_n( diagoff, m, n ) ); +} + static bool_t bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool_t ) @@ -784,10 +792,25 @@ static bool_t bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) ( i != 0 || n_left == 0 ); } -static bool_t bli_is_last_iter( dim_t i, dim_t n_iter, dim_t tid, dim_t nth ) +static bool_t bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool_t ) - ( i == n_iter - 1 - ( ( n_iter - tid - 1 ) % nth ) ); + ( i == end_iter - 1 ); +} + +static bool_t bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) +{ + return ( bool_t ) + ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) ); +} + +static bool_t bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) +{ +#ifdef BLIS_ENABLE_JRIR_SLAB + return bli_is_last_iter_sl( i, end_iter, tid, nth ); +#else // BLIS_ENABLE_JRIR_RR + return bli_is_last_iter_rr( i, end_iter, tid, nth ); +#endif } diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 
bfe7e476f..3b1ef94ce 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -230,8 +230,52 @@ void bli_l3_thread_decorator _Pragma( "omp parallel num_threads(n_threads)" ) { + dim_t n_threads_real = omp_get_num_threads(); dim_t id = omp_get_thread_num(); + // Check if the number of OpenMP threads created within this parallel + // region is different from the number of threads that were requested + // of BLIS. This inequality may trigger when, for example, the + // following conditions are satisfied: + // - an application is executing an OpenMP parallel region in which + // BLIS is invoked, + // - BLIS is configured for multithreading via OpenMP, + // - OMP_NUM_THREADS = t > 1, + // - the number of threads requested of BLIS (regardless of method) + // is p <= t, + // - OpenMP nesting is disabled. + // In this situation, the application spawns t threads. Each application + // thread calls gemm (for example). Each gemm will attempt to spawn p + // threads via OpenMP. However, since nesting is disabled, the OpenMP + // implementation finds that t >= p threads are already spawned, and + // thus it doesn't spawn *any* additional threads for each gemm. + if ( n_threads_real != n_threads ) + { + // If the number of threads active in the current region is not + // equal to the number requested of BLIS, we then only continue + // if the number of threads in the current region is 1. If, for + // example, BLIS requested 4 threads but only got 3, then we + // abort(). + if ( id == 0 ) + { + if ( n_threads_real != 1 ) + { + bli_print_msg( "A different number of threads was " + "created than was requested.", + __FILE__, __LINE__ ); + bli_abort(); + } + + n_threads = 1; + bli_thrcomm_init( gl_comm, 1 ); + bli_rntm_set_num_threads_only( 1, rntm ); + bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm ); + } + + // Synchronize all threads and continue.
+ _Pragma( "omp barrier" ) + } + obj_t a_t, b_t, c_t; cntl_t* cntl_use; thrinfo_t* thread; diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 2931d0951..8b9f41b75 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -61,7 +61,7 @@ void bli_thread_finalize( void ) // ----------------------------------------------------------------------------- -void bli_thread_get_range_sub +void bli_thread_range_sub ( thrinfo_t* thread, dim_t n, @@ -72,6 +72,9 @@ void bli_thread_get_range_sub ) { dim_t n_way = bli_thread_n_way( thread ); + + if ( n_way == 1 ) { *start = 0; *end = n; return; } + dim_t work_id = bli_thread_work_id( thread ); dim_t all_start = 0; @@ -202,7 +205,7 @@ void bli_thread_get_range_sub } } -siz_t bli_thread_get_range_l2r +siz_t bli_thread_range_l2r ( thrinfo_t* thr, obj_t* a, @@ -216,13 +219,13 @@ siz_t bli_thread_get_range_l2r dim_t n = bli_obj_width_after_trans( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); - bli_thread_get_range_sub( thr, n, bf, - FALSE, start, end ); + bli_thread_range_sub( thr, n, bf, + FALSE, start, end ); return m * ( *end - *start ); } -siz_t bli_thread_get_range_r2l +siz_t bli_thread_range_r2l ( thrinfo_t* thr, obj_t* a, @@ -236,13 +239,13 @@ siz_t bli_thread_get_range_r2l dim_t n = bli_obj_width_after_trans( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); - bli_thread_get_range_sub( thr, n, bf, - TRUE, start, end ); + bli_thread_range_sub( thr, n, bf, + TRUE, start, end ); return m * ( *end - *start ); } -siz_t bli_thread_get_range_t2b +siz_t bli_thread_range_t2b ( thrinfo_t* thr, obj_t* a, @@ -256,13 +259,13 @@ siz_t bli_thread_get_range_t2b dim_t n = bli_obj_width_after_trans( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); - bli_thread_get_range_sub( thr, m, bf, - FALSE, start, end ); + bli_thread_range_sub( thr, m, bf, + FALSE, start, end ); return n * ( *end - *start ); } -siz_t bli_thread_get_range_b2t +siz_t bli_thread_range_b2t ( thrinfo_t* thr, obj_t* a, @@ -276,15 +279,15 @@ 
siz_t bli_thread_get_range_b2t dim_t n = bli_obj_width_after_trans( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); - bli_thread_get_range_sub( thr, m, bf, - TRUE, start, end ); + bli_thread_range_sub( thr, m, bf, + TRUE, start, end ); return n * ( *end - *start ); } // ----------------------------------------------------------------------------- -dim_t bli_thread_get_range_width_l +dim_t bli_thread_range_width_l ( doff_t diagoff_j, dim_t m, @@ -495,17 +498,17 @@ siz_t bli_find_area_trap_l // ----------------------------------------------------------------------------- -siz_t bli_thread_get_range_weighted_sub +siz_t bli_thread_range_weighted_sub ( - thrinfo_t* thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool_t handle_edge_low, - dim_t* j_start_thr, - dim_t* j_end_thr + thrinfo_t* restrict thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* restrict j_start_thr, + dim_t* restrict j_end_thr ) { dim_t n_way = bli_thread_n_way( thread ); @@ -570,7 +573,7 @@ siz_t bli_thread_get_range_weighted_sub // Compute the width of the jth subpartition, taking the // current diagonal offset into account, if needed. width_j = - bli_thread_get_range_width_l + bli_thread_range_width_l ( diagoff_j, m, n_left, j, n_way, @@ -614,7 +617,7 @@ siz_t bli_thread_get_range_weighted_sub bli_toggle_bool( &handle_edge_low ); // Compute the appropriate range for the rotated trapezoid. 
- area = bli_thread_get_range_weighted_sub + area = bli_thread_range_weighted_sub ( thread, diagoff, uplo, m, n, bf, handle_edge_low, @@ -632,7 +635,7 @@ siz_t bli_thread_get_range_weighted_sub return area; } -siz_t bli_thread_get_range_mdim +siz_t bli_thread_range_mdim ( dir_t direct, thrinfo_t* thr, @@ -678,20 +681,20 @@ siz_t bli_thread_get_range_mdim if ( use_weighted ) { if ( direct == BLIS_FWD ) - return bli_thread_get_range_weighted_t2b( thr, x, bmult, start, end ); + return bli_thread_range_weighted_t2b( thr, x, bmult, start, end ); else - return bli_thread_get_range_weighted_b2t( thr, x, bmult, start, end ); + return bli_thread_range_weighted_b2t( thr, x, bmult, start, end ); } else { if ( direct == BLIS_FWD ) - return bli_thread_get_range_t2b( thr, x, bmult, start, end ); + return bli_thread_range_t2b( thr, x, bmult, start, end ); else - return bli_thread_get_range_b2t( thr, x, bmult, start, end ); + return bli_thread_range_b2t( thr, x, bmult, start, end ); } } -siz_t bli_thread_get_range_ndim +siz_t bli_thread_range_ndim ( dir_t direct, thrinfo_t* thr, @@ -737,20 +740,20 @@ siz_t bli_thread_get_range_ndim if ( use_weighted ) { if ( direct == BLIS_FWD ) - return bli_thread_get_range_weighted_l2r( thr, x, bmult, start, end ); + return bli_thread_range_weighted_l2r( thr, x, bmult, start, end ); else - return bli_thread_get_range_weighted_r2l( thr, x, bmult, start, end ); + return bli_thread_range_weighted_r2l( thr, x, bmult, start, end ); } else { if ( direct == BLIS_FWD ) - return bli_thread_get_range_l2r( thr, x, bmult, start, end ); + return bli_thread_range_l2r( thr, x, bmult, start, end ); else - return bli_thread_get_range_r2l( thr, x, bmult, start, end ); + return bli_thread_range_r2l( thr, x, bmult, start, end ); } } -siz_t bli_thread_get_range_weighted_l2r +siz_t bli_thread_range_weighted_l2r ( thrinfo_t* thr, obj_t* a, @@ -782,7 +785,7 @@ siz_t bli_thread_get_range_weighted_l2r } area = - bli_thread_get_range_weighted_sub + 
bli_thread_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, FALSE, start, end @@ -790,7 +793,7 @@ siz_t bli_thread_get_range_weighted_l2r } else // if dense or zeros { - area = bli_thread_get_range_l2r + area = bli_thread_range_l2r ( thr, a, bmult, start, end @@ -800,7 +803,7 @@ siz_t bli_thread_get_range_weighted_l2r return area; } -siz_t bli_thread_get_range_weighted_r2l +siz_t bli_thread_range_weighted_r2l ( thrinfo_t* thr, obj_t* a, @@ -834,7 +837,7 @@ siz_t bli_thread_get_range_weighted_r2l bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); area = - bli_thread_get_range_weighted_sub + bli_thread_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, TRUE, start, end @@ -842,7 +845,7 @@ siz_t bli_thread_get_range_weighted_r2l } else // if dense or zeros { - area = bli_thread_get_range_r2l + area = bli_thread_range_r2l ( thr, a, bmult, start, end @@ -852,7 +855,7 @@ siz_t bli_thread_get_range_weighted_r2l return area; } -siz_t bli_thread_get_range_weighted_t2b +siz_t bli_thread_range_weighted_t2b ( thrinfo_t* thr, obj_t* a, @@ -886,7 +889,7 @@ siz_t bli_thread_get_range_weighted_t2b bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); area = - bli_thread_get_range_weighted_sub + bli_thread_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, FALSE, start, end @@ -894,7 +897,7 @@ siz_t bli_thread_get_range_weighted_t2b } else // if dense or zeros { - area = bli_thread_get_range_t2b + area = bli_thread_range_t2b ( thr, a, bmult, start, end @@ -904,7 +907,7 @@ siz_t bli_thread_get_range_weighted_t2b return area; } -siz_t bli_thread_get_range_weighted_b2t +siz_t bli_thread_range_weighted_b2t ( thrinfo_t* thr, obj_t* a, @@ -939,7 +942,7 @@ siz_t bli_thread_get_range_weighted_b2t bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); - area = bli_thread_get_range_weighted_sub + area = bli_thread_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, TRUE, start, end @@ -947,7 +950,7 @@ siz_t bli_thread_get_range_weighted_b2t } else // if dense or zeros { - area = 
bli_thread_get_range_b2t + area = bli_thread_range_b2t ( thr, a, bmult, start, end diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 20c70a8f5..8dff32141 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -6,6 +6,7 @@ Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -56,7 +57,8 @@ void bli_thread_finalize( void ); #endif // Thread range-related prototypes. -void bli_thread_get_range_sub + +void bli_thread_range_sub ( thrinfo_t* thread, dim_t n, @@ -82,8 +84,8 @@ siz_t PASTEMAC0( opname ) \ dim_t* end \ ); -GENPROT( thread_get_range_mdim ) -GENPROT( thread_get_range_ndim ) +GENPROT( thread_range_mdim ) +GENPROT( thread_range_ndim ) #undef GENPROT #define GENPROT( opname ) \ @@ -97,18 +99,18 @@ siz_t PASTEMAC0( opname ) \ dim_t* end \ ); -GENPROT( thread_get_range_l2r ) -GENPROT( thread_get_range_r2l ) -GENPROT( thread_get_range_t2b ) -GENPROT( thread_get_range_b2t ) +GENPROT( thread_range_l2r ) +GENPROT( thread_range_r2l ) +GENPROT( thread_range_t2b ) +GENPROT( thread_range_b2t ) -GENPROT( thread_get_range_weighted_l2r ) -GENPROT( thread_get_range_weighted_r2l ) -GENPROT( thread_get_range_weighted_t2b ) -GENPROT( thread_get_range_weighted_b2t ) +GENPROT( thread_range_weighted_l2r ) +GENPROT( thread_range_weighted_r2l ) +GENPROT( thread_range_weighted_t2b ) +GENPROT( thread_range_weighted_b2t ) -dim_t bli_thread_get_range_width_l +dim_t bli_thread_range_width_l ( doff_t diagoff_j, dim_t m, @@ -126,17 +128,17 @@ siz_t bli_find_area_trap_l dim_t n, doff_t diagoff ); -siz_t bli_thread_get_range_weighted_sub +siz_t bli_thread_range_weighted_sub ( - thrinfo_t* thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool_t handle_edge_low, - dim_t* 
j_start_thr, - dim_t* j_end_thr + thrinfo_t* restrict thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* restrict j_start_thr, + dim_t* restrict j_end_thr ); @@ -211,5 +213,98 @@ void bli_thread_init_rntm( rntm_t* rntm ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); +// ----------------------------------------------------------------------------- + +static void bli_thread_range_jrir_rr + ( + thrinfo_t* thread, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + // Use interleaved partitioning of jr/ir loops. + *start = bli_thread_work_id( thread ); + *inc = bli_thread_n_way( thread ); + *end = n; +} + +static void bli_thread_range_jrir_sl + ( + thrinfo_t* thread, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + // Use contiguous slab partitioning of jr/ir loops. + bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); + *inc = 1; +} + +static void bli_thread_range_jrir + ( + thrinfo_t* thread, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + // Define a general-purpose version of bli_thread_range_jrir() whose + // definition depends on whether slab or round-robin partitioning was + // requested at configure-time. +#ifdef BLIS_ENABLE_JRIR_SLAB + bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); +#else + bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); +#endif +} + +#if 0 +static void bli_thread_range_weighted_jrir + ( + thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool_t handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ +#ifdef BLIS_ENABLE_JRIR_SLAB + + // Use contiguous slab partitioning for jr/ir loops. 
+ bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, + handle_edge_low, start, end ); + + *start = *start / bf; *inc = 1; + + if ( *end % bf ) *end = *end / bf + 1; + else *end = *end / bf; + +#else + + // Use interleaved partitioning of jr/ir loops. + *start = bli_thread_work_id( thread ); + *inc = bli_thread_n_way( thread ); + *end = n; + +#endif +} +#endif + #endif diff --git a/kernels/bgq/1/bli_dotv_bgq_int.c b/kernels/bgq/1/bli_dotv_bgq_int.c index 3e8e930de..cd2d4bce8 100644 --- a/kernels/bgq/1/bli_dotv_bgq_int.c +++ b/kernels/bgq/1/bli_dotv_bgq_int.c @@ -34,8 +34,8 @@ #include "blis.h" -void bli_ddotv_bgq_int - ( +void bli_ddotv_bgq_int + ( conj_t conjx, conj_t conjy, dim_t n, @@ -44,14 +44,14 @@ void bli_ddotv_bgq_int double* restrict rho, cntx_t* restrict cntx ) -{ +{ bool_t use_ref = FALSE; // If the vector lengths are zero, set rho to zero and return. if ( bli_zero_dim1( n ) ) { - PASTEMAC(d,set0s)( rho ); - return; - } + PASTEMAC(d,set0s)( *rho ); + return; + } // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( incx != 1 || incy != 1 || bli_is_unaligned_to( ( siz_t )x, 32 ) || bli_is_unaligned_to( ( siz_t )y, 32 ) ) @@ -64,7 +64,7 @@ void bli_ddotv_bgq_int dim_t n_run = n / 4; dim_t n_left = n % 4; - + double rhos = 0.0; #pragma omp parallel reduction(+:rhos) { diff --git a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c index 95b5841e9..3b5cecd39 100644 --- a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c +++ b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c @@ -39,15 +39,15 @@ /* - * Here is dgemm kernel for QPX. + * Here is dgemm kernel for QPX. 
* Instruction mix was divined by a statement in an email from John Gunnels when asked about the peak performance with a single thread: * "Achievable peak can either be: * 1) 12.8 GF 8 FMAs cycle * 1.6 GHz * 2) 8.53 GF Takes intoo account the instruction mix in DGEMM and the fact that you can only do an FMA or a load/store in a single cycle with just one thread * 3) 7.58 GF (2) + the fact that we can only issue 8 instructions in 9 cycles with one thread" * - * Which I have taken to mean: 8.53 GFLOPS implies on average 5.33 flops/cycle. - * I know the kernel John uses is 8x8, so 16 flops per loop iteration. + * Which I have taken to mean: 8.53 GFLOPS implies on average 5.33 flops/cycle. + * I know the kernel John uses is 8x8, so 16 flops per loop iteration. * Thus there must be 24 total instructions per iteration because 16/24 = 5.33. * * Here, we have 6 loads per iteration. These are executed on a different pipeline from FMAs so @@ -56,23 +56,16 @@ void bli_dgemm_bgq_int_8x8 ( - dim_t k0, + dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, - double* restrict c, inc_t rs_c0, inc_t cs_c0, + double* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { - // Typecast local copies of integers in case dim_t and inc_t are a - // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; - uint64_t rs_c = rs_c0; - uint64_t cs_c = cs_c0; - //Registers for storing C. 
//4 4x4 subblocks of C, c00, c01, c10, c11 //4 registers per subblock: a, b, c, d @@ -110,7 +103,7 @@ void bli_dgemm_bgq_int_8x8 a0 = vec_lda ( 0 * sizeof(double), &a[8*i] ); a1 = vec_lda ( 4 * sizeof(double), &a[8*i] ); - + c00a = vec_xmadd ( b0a, a0, c00a ); c00b = vec_xxmadd( a0, b0a, c00b ); c00c = vec_xmadd ( b0b, a0, c00c ); @@ -131,7 +124,7 @@ void bli_dgemm_bgq_int_8x8 c11c = vec_xmadd ( b1b, a1, c11c ); c11d = vec_xxmadd( a1, b1b, c11d ); } - + // Create patterns for permuting Cb and Cd vector4double pattern = vec_gpci( 01032 ); @@ -140,7 +133,7 @@ void bli_dgemm_bgq_int_8x8 vector4double betav = vec_lds( 0, ( double* )beta ); vector4double alphav = vec_lds( 0, ( double* )alpha ); double ct; - + //Macro to update 4 elements of C in a column. //REG is the register holding those 4 elements //ADDR is the address to write them to @@ -167,7 +160,7 @@ void bli_dgemm_bgq_int_8x8 *(ADDR + (OFFSET + 2) * rs_c) = ct; \ ct = vec_extract( AB, 3 ); \ *(ADDR + (OFFSET + 3) * rs_c) = ct; \ -} +} //Update c00 and c10 sub-blocks UPDATE( c00a, c, 0 ); UPDATE( c10a, c, 4 ); @@ -263,7 +256,7 @@ void bli_zgemm_bgq_int_4x4 for( dim_t i = 0; i < k; i++ ) { - + b0 = vec_ld2a( 0 * sizeof(double), &b_d[8*i] ); b1 = vec_ld2a( 2 * sizeof(double), &b_d[8*i] ); b2 = vec_ld2a( 4 * sizeof(double), &b_d[8*i] ); @@ -271,7 +264,7 @@ void bli_zgemm_bgq_int_4x4 a0 = vec_lda ( 0 * sizeof(double), &a_d[8*i] ); a1 = vec_lda ( 4 * sizeof(double), &a_d[8*i] ); - + c00a = vec_xmadd ( b0, a0, c00a ); c00b = vec_xxcpnmadd( a0, b0, c00b ); c01a = vec_xmadd ( b1, a0, c01a ); @@ -308,7 +301,7 @@ void bli_zgemm_bgq_int_4x4 double alphai = bli_zimag( *alpha ); double betar = bli_zreal( *beta ); double betai = bli_zimag( *beta ); - vector4double alphav = vec_splats( 0.0 ); + vector4double alphav = vec_splats( 0.0 ); vector4double betav = vec_splats( 0.0 ); alphav = vec_insert( alphar, alphav, 0); alphav = vec_insert( alphai, alphav, 1); @@ -319,7 +312,7 @@ void bli_zgemm_bgq_int_4x4 betav = vec_insert( 
betar, betav, 2); betav = vec_insert( betai, betav, 3); double ct; - + //Macro to update 2 elements of C in a column. //REG1 is the register holding the first partial sum of those 2 elements diff --git a/kernels/zen/3/bli_gemm_zen_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c similarity index 99% rename from kernels/zen/3/bli_gemm_zen_asm_d6x8.c rename to kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c index 463155581..60073c501 100644 --- a/kernels/zen/3/bli_gemm_zen_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c @@ -77,7 +77,7 @@ vpermilps(imm(0x39), xmm2, xmm1) \ vmovss(xmm1, mem(rcx, r10, 1)) -void bli_sgemm_zen_asm_6x16 +void bli_sgemm_haswell_asm_6x16 ( dim_t k0, float* restrict alpha, @@ -923,7 +923,7 @@ void bli_sgemm_zen_asm_6x16 vmovlpd(xmm1, mem(rcx, r13, 2)) \ vmovhpd(xmm1, mem(rcx, r10, 1))*/ -void bli_dgemm_zen_asm_6x8 +void bli_dgemm_haswell_asm_6x8 ( dim_t k0, double* restrict alpha, @@ -1669,7 +1669,7 @@ void bli_dgemm_zen_asm_6x8 #define CGEMM_OUTPUT_RS \ vmovups(ymm0, mem(rcx)) \ -void bli_cgemm_zen_asm_3x8 +void bli_cgemm_haswell_asm_3x8 ( dim_t k0, scomplex* restrict alpha, @@ -2197,7 +2197,7 @@ void bli_cgemm_zen_asm_3x8 #define ZGEMM_OUTPUT_RS \ vmovupd(ymm0, mem(rcx)) \ -void bli_zgemm_zen_asm_3x4 +void bli_zgemm_haswell_asm_3x4 ( dim_t k0, dcomplex* restrict alpha, diff --git a/kernels/zen/3/bli_gemm_zen_asm_d8x6.c b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c similarity index 99% rename from kernels/zen/3/bli_gemm_zen_asm_d8x6.c rename to kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c index 830cbec59..098d79d75 100644 --- a/kernels/zen/3/bli_gemm_zen_asm_d8x6.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c @@ -76,7 +76,7 @@ vpermilps(imm(0x39), xmm2, xmm1) \ vmovss(xmm1, mem(rcx, r10, 1)) -void bli_sgemm_zen_asm_16x6 +void bli_sgemm_haswell_asm_16x6 ( dim_t k0, float* restrict alpha, @@ -662,7 +662,7 @@ void bli_sgemm_zen_asm_16x6 vmovlpd(xmm1, mem(rcx, r13, 2)) \ vmovhpd(xmm1, mem(rcx, r10, 1))*/ -void 
bli_dgemm_zen_asm_8x6 +void bli_dgemm_haswell_asm_8x6 ( dim_t k0, double* restrict alpha, @@ -1257,7 +1257,7 @@ void bli_dgemm_zen_asm_8x6 #define CGEMM_OUTPUT_CS \ vmovups(ymm0, mem(rcx)) \ -void bli_cgemm_zen_asm_8x3 +void bli_cgemm_haswell_asm_8x3 ( dim_t k0, scomplex* restrict alpha, @@ -1785,7 +1785,7 @@ void bli_cgemm_zen_asm_8x3 #define ZGEMM_OUTPUT_CS \ vmovupd(ymm0, mem(rcx)) \ -void bli_zgemm_zen_asm_4x3 +void bli_zgemm_haswell_asm_4x3 ( dim_t k0, dcomplex* restrict alpha, diff --git a/kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c similarity index 99% rename from kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c rename to kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c index 288dd2fed..1a2e4a012 100644 --- a/kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c @@ -56,7 +56,7 @@ vmovss(xmm1, mem(rcx, r10, 1)) -void bli_sgemmtrsm_l_zen_asm_6x16 +void bli_sgemmtrsm_l_haswell_asm_6x16 ( dim_t k0, float* restrict alpha, @@ -810,7 +810,7 @@ void bli_sgemmtrsm_l_zen_asm_6x16 vmovlpd(xmm1, mem(rcx, r13, 2)) \ vmovhpd(xmm1, mem(rcx, r10, 1))*/ -void bli_dgemmtrsm_l_zen_asm_6x8 +void bli_dgemmtrsm_l_haswell_asm_6x8 ( dim_t k0, double* restrict alpha, diff --git a/kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c similarity index 99% rename from kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c rename to kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c index 748769cb3..2ac286e8d 100644 --- a/kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c @@ -56,7 +56,7 @@ vmovss(xmm1, mem(rcx, r10, 1)) -void bli_sgemmtrsm_u_zen_asm_6x16 +void bli_sgemmtrsm_u_haswell_asm_6x16 ( dim_t k0, float* restrict alpha, @@ -814,7 +814,7 @@ void bli_sgemmtrsm_u_zen_asm_6x16 vmovlpd(xmm1, mem(rcx, r13, 2)) \ vmovhpd(xmm1, mem(rcx, r10, 1))*/ -void bli_dgemmtrsm_u_zen_asm_6x8 +void 
bli_dgemmtrsm_u_haswell_asm_6x8 ( dim_t k0, double* restrict alpha, diff --git a/kernels/haswell/bli_kernels_haswell.h b/kernels/haswell/bli_kernels_haswell.h index c8f0f1653..85670345d 100644 --- a/kernels/haswell/bli_kernels_haswell.h +++ b/kernels/haswell/bli_kernels_haswell.h @@ -32,23 +32,32 @@ */ -// d12x4 -GEMM_UKR_PROT( float, s, gemm_haswell_asm_24x4 ) -GEMM_UKR_PROT( double, d, gemm_haswell_asm_12x4 ) +// -- level-3 -- -// d4x12 -GEMM_UKR_PROT( float, s, gemm_haswell_asm_4x24 ) -GEMM_UKR_PROT( double, d, gemm_haswell_asm_4x12 ) - -// d6x8 +// gemm (asm d6x8) GEMM_UKR_PROT( float, s, gemm_haswell_asm_6x16 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_6x8 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_3x8 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_3x4 ) -// d8x6 +// gemm (asm d8x6) GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) +// gemmtrsm_l (asm d6x8) +GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_haswell_asm_6x16 ) +GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_haswell_asm_6x8 ) + +// gemmtrsm_u (asm d6x8) +GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 ) +GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 ) + + +// gemm (asm d8x6) +//GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) +//GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) +//GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) +//GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) + diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h index 119771436..842989a5a 100644 --- a/kernels/zen/bli_kernels_zen.h +++ b/kernels/zen/bli_kernels_zen.h @@ -76,32 +76,3 @@ AXPYF_KER_PROT( double, d, axpyf_zen_int_8 ) DOTXF_KER_PROT( float, s, dotxf_zen_int_8 ) DOTXF_KER_PROT( double, d, dotxf_zen_int_8 ) -// -- level-3 -- - -// gemm (asm d6x8) -GEMM_UKR_PROT( float, s, gemm_zen_asm_6x16 ) -GEMM_UKR_PROT( double, d, gemm_zen_asm_6x8 
) -GEMM_UKR_PROT( scomplex, c, gemm_zen_asm_3x8 ) -GEMM_UKR_PROT( dcomplex, z, gemm_zen_asm_3x4 ) - -// gemm (asm d8x6) -GEMM_UKR_PROT( float, s, gemm_zen_asm_16x6 ) -GEMM_UKR_PROT( double, d, gemm_zen_asm_8x6 ) -GEMM_UKR_PROT( scomplex, c, gemm_zen_asm_8x3 ) -GEMM_UKR_PROT( dcomplex, z, gemm_zen_asm_4x3 ) - -// gemmtrsm_l (asm d6x8) -GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_zen_asm_6x16 ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_zen_asm_6x8 ) - -// gemmtrsm_u (asm d6x8) -GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_zen_asm_6x16 ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_zen_asm_6x8 ) - - -// gemm (asm d8x6) -//GEMM_UKR_PROT( float, s, gemm_zen_asm_16x6 ) -//GEMM_UKR_PROT( double, d, gemm_zen_asm_8x6 ) -//GEMM_UKR_PROT( scomplex, c, gemm_zen_asm_8x3 ) -//GEMM_UKR_PROT( dcomplex, z, gemm_zen_asm_4x3 ) - diff --git a/sandbox/ref99/blx_gemm_int.c b/sandbox/ref99/blx_gemm_int.c index 4937095a9..febb8040a 100644 --- a/sandbox/ref99/blx_gemm_int.c +++ b/sandbox/ref99/blx_gemm_int.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -46,10 +47,10 @@ void blx_gemm_int thrinfo_t* thread ) { - obj_t a_local; - obj_t b_local; - obj_t c_local; - gemm_voft f; + obj_t a_local; + obj_t b_local; + obj_t c_local; + gemm_var_oft f; // Alias A, B, and C in case we need to update attached scalars. bli_obj_alias_to( a, &a_local ); diff --git a/sandbox/ref99/cntl/blx_gemm_cntl.c b/sandbox/ref99/cntl/blx_gemm_cntl.c index ebcf6da30..d182296fa 100644 --- a/sandbox/ref99/cntl/blx_gemm_cntl.c +++ b/sandbox/ref99/cntl/blx_gemm_cntl.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -54,7 +55,14 @@ cntl_t* blx_gemmbp_cntl_create pack_t schema_b ) { - void* macro_kernel_p = blx_gemm_ker_var2; + void* macro_kernel_fp; + void* packa_fp; + void* packb_fp; + + macro_kernel_fp = blx_gemm_ker_var2; + + packa_fp = bli_packm_blk_var1; + packb_fp = bli_packm_blk_var1; // Create two nodes for the macro-kernel. cntl_t* gemm_cntl_bu_ke = blx_gemm_cntl_create_node @@ -69,7 +77,7 @@ cntl_t* blx_gemmbp_cntl_create ( family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() - macro_kernel_p, + macro_kernel_fp, gemm_cntl_bu_ke ); @@ -77,7 +85,7 @@ cntl_t* blx_gemmbp_cntl_create cntl_t* gemm_cntl_packa = blx_packm_cntl_create_node ( blx_gemm_packa, // pack the left-hand operand - bli_packm_blk_var1, + packa_fp, BLIS_MR, BLIS_KR, FALSE, // do NOT invert diagonal @@ -101,7 +109,7 @@ cntl_t* blx_gemmbp_cntl_create cntl_t* gemm_cntl_packb = blx_packm_cntl_create_node ( blx_gemm_packb, // pack the right-hand operand - bli_packm_blk_var1, + packb_fp, BLIS_KR, BLIS_NR, FALSE, // do NOT invert diagonal diff --git a/sandbox/ref99/vars/blx_gemm_blk_var1.c b/sandbox/ref99/vars/blx_gemm_blk_var1.c index 43eb40bef..70482ede1 100644 --- a/sandbox/ref99/vars/blx_gemm_blk_var1.c +++ b/sandbox/ref99/vars/blx_gemm_blk_var1.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -52,7 +53,7 @@ void blx_gemm_blk_var1 dim_t my_start, my_end; // Determine the current thread's subpartition range. 
- bli_thread_get_range_mdim + bli_thread_range_mdim ( BLIS_FWD, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/sandbox/ref99/vars/blx_gemm_blk_var2.c b/sandbox/ref99/vars/blx_gemm_blk_var2.c index debcb2dfc..00a19ceef 100644 --- a/sandbox/ref99/vars/blx_gemm_blk_var2.c +++ b/sandbox/ref99/vars/blx_gemm_blk_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -52,7 +53,7 @@ void blx_gemm_blk_var2 dim_t my_start, my_end; // Determine the current thread's subpartition range. - bli_thread_get_range_ndim + bli_thread_range_ndim ( BLIS_FWD, thread, a, b, c, cntl, cntx, &my_start, &my_end diff --git a/sandbox/ref99/vars/blx_gemm_ker_var2.c b/sandbox/ref99/vars/blx_gemm_ker_var2.c index c780489e9..21282a3f5 100644 --- a/sandbox/ref99/vars/blx_gemm_ker_var2.c +++ b/sandbox/ref99/vars/blx_gemm_ker_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -256,16 +257,31 @@ void PASTECH2(blx_,ch,varname) \ bli_auxinfo_set_is_b( is_b, &aux ); \ \ /* Save the desired output datatype (indicating no typecasting). */ \ - bli_auxinfo_set_dt_on_output( dt, &aux ); \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. 
Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -280,7 +296,7 @@ void PASTECH2(blx_,ch,varname) \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ @@ -290,12 +306,12 @@ void PASTECH2(blx_,ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ - if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ - if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ diff --git a/sandbox/ref99/vars/blx_gemm_var.h b/sandbox/ref99/vars/blx_gemm_var.h index 22911eda2..b434ea60a 100644 --- a/sandbox/ref99/vars/blx_gemm_var.h +++ b/sandbox/ref99/vars/blx_gemm_var.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/sandbox/ref99/vars/other/blx_gemm_ker_var2rr.c b/sandbox/ref99/vars/other/blx_gemm_ker_var2rr.c new file mode 100644 index 000000000..eff1ecc85 --- /dev/null +++ b/sandbox/ref99/vars/other/blx_gemm_ker_var2rr.c @@ -0,0 +1,373 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "blix.h" + +// Function pointer type for datatype-specific functions. +typedef void (*gemm_fp) + ( + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +// Function pointer array for datatype-specific functions. 
+static gemm_fp ftypes[BLIS_NUM_FP_TYPES] = +{ + PASTECH2(blx_,s,gemm_ker_var2rr), + PASTECH2(blx_,c,gemm_ker_var2rr), + PASTECH2(blx_,d,gemm_ker_var2rr), + PASTECH2(blx_,z,gemm_ker_var2rr) +}; + + +void blx_gemm_ker_var2rr + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + gemm_fp f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. 
+ f( schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(blx_,ch,varname) \ + ( \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t i, j; \ + dim_t m_cur; \ + dim_t n_cur; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for each thrinfo_t node. */ \ + bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_rr( i, ir_end, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, jr_end, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ +} + +#if 0 +GENTFUNC( float, s, gemm_ker_var2rr ) +GENTFUNC( double, d, gemm_ker_var2rr ) +GENTFUNC( scomplex, c, gemm_ker_var2rr ) +GENTFUNC( dcomplex, z, gemm_ker_var2rr ) +#else +INSERT_GENTFUNC_BASIC0( gemm_ker_var2rr ) +#endif + diff --git a/sandbox/ref99/vars/other/blx_gemm_ker_var2sl.c b/sandbox/ref99/vars/other/blx_gemm_ker_var2sl.c new file mode 100644 index 000000000..31f51df92 --- /dev/null +++ b/sandbox/ref99/vars/other/blx_gemm_ker_var2sl.c @@ -0,0 +1,373 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "blix.h" + +// Function pointer type for datatype-specific functions. +typedef void (*gemm_fp) + ( + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +// Function pointer array for datatype-specific functions. 
+static gemm_fp ftypes[BLIS_NUM_FP_TYPES] = +{ + PASTECH2(blx_,s,gemm_ker_var2sl), + PASTECH2(blx_,c,gemm_ker_var2sl), + PASTECH2(blx_,d,gemm_ker_var2sl), + PASTECH2(blx_,z,gemm_ker_var2sl) +}; + + +void blx_gemm_ker_var2sl + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + gemm_fp f; + + // Detach and multiply the scalars attached to A and B. + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. 
+ f( schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(blx_,ch,varname) \ + ( \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t i, j; \ + dim_t m_cur; \ + dim_t n_cur; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. 
*/ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for each thrinfo_t node. */ \ + bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_sl( i, ir_end, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_sl( j, jr_end, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ +} + +#if 0 +GENTFUNC( float, s, gemm_ker_var2sl ) +GENTFUNC( double, d, gemm_ker_var2sl ) +GENTFUNC( scomplex, c, gemm_ker_var2sl ) +GENTFUNC( dcomplex, z, gemm_ker_var2sl ) +#else +INSERT_GENTFUNC_BASIC0( gemm_ker_var2sl ) +#endif + diff --git a/test/3m4m/Makefile b/test/3m4m/Makefile index 3dcd6d435..e91b100b2 100644 --- a/test/3m4m/Makefile +++ b/test/3m4m/Makefile @@ -5,6 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2018, Advanced Micro Devices, Inc. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -200,13 +201,13 @@ STR_ST := -DTHR_STR=\"st\" STR_MT := -DTHR_STR=\"mt\" # Problem size specification -PDEF_ST := -DP_BEGIN=100 \ +PDEF_ST := -DP_BEGIN=96 \ -DP_END=2000 \ - -DP_INC=100 + -DP_INC=96 -PDEF_MT := -DP_BEGIN=200 \ - -DP_END=10000 \ - -DP_INC=200 +PDEF_MT := -DP_BEGIN=192 \ + -DP_END=3000 \ + -DP_INC=192 @@ -226,9 +227,6 @@ all-mt: blis-mt openblas-mt mkl-mt blis-st: blis-gemm-st blis-mt: blis-gemm-mt -blis-nat-st: blis-gemm-nat-st -blis-nat-mt: blis-gemm-nat-mt - openblas-st: openblas-gemm-st openblas-mt: openblas-gemm-mt @@ -240,6 +238,42 @@ blis-gemm-st: blis-gemm-nat-st \ blis-gemm-mt: blis-gemm-nat-mt \ blis-gemm-ind-mt +blis-nat-st: \ + test_sgemm_asm_blis_st.x \ + test_dgemm_asm_blis_st.x \ + test_cgemm_asm_blis_st.x \ + test_zgemm_asm_blis_st.x \ + test_sherk_asm_blis_st.x \ + test_dherk_asm_blis_st.x \ + test_cherk_asm_blis_st.x \ + test_zherk_asm_blis_st.x \ + test_strmm_asm_blis_st.x \ + test_dtrmm_asm_blis_st.x \ + test_ctrmm_asm_blis_st.x \ + test_ztrmm_asm_blis_st.x \ + test_strsm_asm_blis_st.x \ + test_dtrsm_asm_blis_st.x \ + test_ctrsm_asm_blis_st.x \ + test_ztrsm_asm_blis_st.x + +blis-nat-mt: \ + test_sgemm_asm_blis_mt.x \ + test_dgemm_asm_blis_mt.x \ + test_cgemm_asm_blis_mt.x \ + test_zgemm_asm_blis_mt.x \ + test_sherk_asm_blis_mt.x \ + test_dherk_asm_blis_mt.x \ + test_cherk_asm_blis_mt.x \ + test_zherk_asm_blis_mt.x \ + test_strmm_asm_blis_mt.x \ + test_dtrmm_asm_blis_mt.x \ + test_ctrmm_asm_blis_mt.x \ + test_ztrmm_asm_blis_mt.x \ + test_strsm_asm_blis_mt.x \ + test_dtrsm_asm_blis_mt.x \ + test_ctrsm_asm_blis_mt.x \ + test_ztrsm_asm_blis_mt.x + blis-gemm-nat-st: \ test_sgemm_asm_blis_st.x \ test_dgemm_asm_blis_st.x \ @@ -390,28 +424,28 @@ test_c%_1m_blis_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_MT) -c $< -o $@ # blis asm -test_d%_asm_blis_st.o: 
test_%.c +test_d%_asm_blis_st.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_ST) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@ -test_s%_asm_blis_st.o: test_%.c +test_s%_asm_blis_st.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_ST) $(DT_S) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@ -test_z%_asm_blis_st.o: test_%.c +test_z%_asm_blis_st.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@ -test_c%_asm_blis_st.o: test_%.c +test_c%_asm_blis_st.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@ -test_d%_asm_blis_mt.o: test_%.c +test_d%_asm_blis_mt.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@ -test_s%_asm_blis_mt.o: test_%.c +test_s%_asm_blis_mt.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@ -test_z%_asm_blis_mt.o: test_%.c +test_z%_asm_blis_mt.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@ -test_c%_asm_blis_mt.o: test_%.c +test_c%_asm_blis_mt.o: test_%.c Makefile $(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@ # openblas diff --git a/test/3m4m/test_herk.c b/test/3m4m/test_herk.c new file mode 100644 index 000000000..66a057a59 --- /dev/null +++ b/test/3m4m/test_herk.c @@ -0,0 +1,314 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include "blis.h" + + +//#define PRINT + +int main( int argc, char** argv ) +{ + obj_t a, c; + obj_t c_save; + obj_t alpha, beta; + dim_t m, k; + dim_t p; + dim_t p_begin, p_end, p_inc; + int m_input, k_input; + ind_t ind; + num_t dt, dt_real; + char dt_ch; + int r, n_repeats; + uplo_t uploc; + trans_t transa; + f77_char f77_uploc; + f77_char f77_transa; + + double dtime; + double dtime_save; + double gflops; + + //bli_init(); + + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); + + n_repeats = 3; + + dt = DT; + dt_real = bli_dt_proj_to_real( DT ); + + ind = IND; + + p_begin = P_BEGIN; + p_end = P_END; + p_inc = P_INC; + + m_input = -1; + k_input = -1; + + + // Supress compiler warnings about unused variable 'ind'. 
+ ( void )ind; + +#if 0 + + cntx_t* cntx; + + ind_t ind_mod = ind; + + // A hack to use 3m1 as 1mpb (with 1m as 1mbp). + if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; + + // Initialize a context for the current induced method and datatype. + cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + + // Set k to the kc blocksize for the current datatype. + k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + +#elif 1 + + //k_input = 256; + +#endif + + // Choose the char corresponding to the requested datatype. + if ( bli_is_float( dt ) ) dt_ch = 's'; + else if ( bli_is_double( dt ) ) dt_ch = 'd'; + else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; + else dt_ch = 'z'; + + uploc = BLIS_LOWER; + transa = BLIS_NO_TRANSPOSE; + + bli_param_map_blis_to_netlib_uplo( uploc, &f77_uploc ); + bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front. + for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; +#ifdef BLIS + printf( "data_%s_%cherk_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )0, + ( unsigned long )0, 0.0 ); + + + for ( p = p_begin; p <= p_end; p += p_inc ) + { + + if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); + else m = ( dim_t ) m_input; + if ( k_input < 0 ) k = p / ( dim_t )abs(k_input); + else k = ( dim_t ) k_input; + + bli_obj_create( dt_real, 1, 1, 0, 0, &alpha ); + bli_obj_create( dt, 1, 1, 0, 0, &beta ); + + if ( bli_does_trans( transa ) ) + bli_obj_create( dt, k, m, 0, 0, &a ); + else + bli_obj_create( dt, m, k, 0, 0, &a ); + bli_obj_create( dt, m, m, 0, 0, &c ); + //bli_obj_create( dt, m, k, 2, 2*m, &a ); + //bli_obj_create( dt, k, n, 2, 2*k, &b ); + //bli_obj_create( dt, m, n, 2, 2*m, &c ); + bli_obj_create( dt, m, m, 0, 0, &c_save ); + + bli_randm( &a ); + 
bli_randm( &c ); + + bli_obj_set_struc( BLIS_HERMITIAN, &c ); + bli_obj_set_uplo( uploc, &c ); + + bli_obj_set_conjtrans( transa, &a ); + + bli_setsc( (2.0/1.0), 0.0, &alpha ); + bli_setsc( (1.0/1.0), 0.0, &beta ); + + + bli_copym( &c, &c_save ); + +#ifdef BLIS + bli_ind_disable_all_dt( dt ); + bli_ind_enable_dt( ind, dt ); +#endif + + dtime_save = DBL_MAX; + + for ( r = 0; r < n_repeats; ++r ) + { + bli_copym( &c_save, &c ); + + + dtime = bli_clock(); + + +#ifdef PRINT + bli_printm( "a", &a, "%4.1f", "" ); + bli_printm( "c", &c, "%4.1f", "" ); +#endif + +#ifdef BLIS + + bli_herk( &alpha, + &a, + &beta, + &c ); + +#else + + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = bli_obj_buffer( &alpha ); + float* ap = bli_obj_buffer( &a ); + float* betap = bli_obj_buffer( &beta ); + float* cp = bli_obj_buffer( &c ); + + ssyrk_( &f77_uploc, + &f77_transa, + &mm, + &kk, + alphap, + ap, &lda, + betap, + cp, &ldc ); + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = bli_obj_buffer( &alpha ); + double* ap = bli_obj_buffer( &a ); + double* betap = bli_obj_buffer( &beta ); + double* cp = bli_obj_buffer( &c ); + + dsyrk_( &f77_uploc, + &f77_transa, + &mm, + &kk, + alphap, + ap, &lda, + betap, + cp, &ldc ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = bli_obj_buffer( &alpha ); + scomplex* ap = bli_obj_buffer( &a ); + scomplex* betap = bli_obj_buffer( &beta ); + scomplex* cp = bli_obj_buffer( &c ); + + cherk_( &f77_uploc, + &f77_transa, + &mm, + &kk, + 
alphap, + ap, &lda, + betap, + cp, &ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = bli_obj_buffer( &alpha ); + dcomplex* ap = bli_obj_buffer( &a ); + dcomplex* betap = bli_obj_buffer( &beta ); + dcomplex* cp = bli_obj_buffer( &c ); + + zherk_( &f77_uploc, + &f77_transa, + &mm, + &kk, + alphap, + ap, &lda, + betap, + cp, &ldc ); + } +#endif + +#ifdef PRINT + bli_printm( "c after", &c, "%4.1f", "" ); + exit(1); +#endif + + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + gflops = ( 1.0 * m * k * m ) / ( dtime_save * 1.0e9 ); + + if ( bli_is_complex( dt ) ) gflops *= 4.0; + +#ifdef BLIS + printf( "data_%s_%cherk_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )m, + ( unsigned long )k, gflops ); + + bli_obj_free( &alpha ); + bli_obj_free( &beta ); + + bli_obj_free( &a ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); + } + + //bli_finalize(); + + return 0; +} + diff --git a/test/3m4m/test_trmm.c b/test/3m4m/test_trmm.c new file mode 100644 index 000000000..06ed38539 --- /dev/null +++ b/test/3m4m/test_trmm.c @@ -0,0 +1,328 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include "blis.h" + + +//#define PRINT + +int main( int argc, char** argv ) +{ + obj_t a, c; + obj_t c_save; + obj_t alpha; + dim_t m, n; + dim_t p; + dim_t p_begin, p_end, p_inc; + int m_input, n_input; + ind_t ind; + num_t dt; + char dt_ch; + int r, n_repeats; + side_t side; + uplo_t uploa; + trans_t transa; + diag_t diaga; + f77_char f77_side; + f77_char f77_uploa; + f77_char f77_transa; + f77_char f77_diaga; + + double dtime; + double dtime_save; + double gflops; + + //bli_init(); + + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); + + n_repeats = 3; + + dt = DT; + + ind = IND; + + p_begin = P_BEGIN; + p_end = P_END; + p_inc = P_INC; + + m_input = -1; + n_input = -1; + + + // Supress compiler warnings about unused variable 'ind'. 
+ ( void )ind; + +#if 0 + + cntx_t* cntx; + + ind_t ind_mod = ind; + + // A hack to use 3m1 as 1mpb (with 1m as 1mbp). + if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; + + // Initialize a context for the current induced method and datatype. + cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + + // Set k to the kc blocksize for the current datatype. + k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + +#elif 1 + + //k_input = 256; + +#endif + + // Choose the char corresponding to the requested datatype. + if ( bli_is_float( dt ) ) dt_ch = 's'; + else if ( bli_is_double( dt ) ) dt_ch = 'd'; + else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; + else dt_ch = 'z'; + +#if 0 + side = BLIS_LEFT; +#else + side = BLIS_RIGHT; +#endif +#if 0 + uploa = BLIS_LOWER; +#else + uploa = BLIS_UPPER; +#endif + transa = BLIS_NO_TRANSPOSE; + diaga = BLIS_NONUNIT_DIAG; + + bli_param_map_blis_to_netlib_side( side, &f77_side ); + bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa ); + bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); + bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga ); + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front. 
+ for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; +#ifdef BLIS + printf( "data_%s_%ctrmm_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )0, + ( unsigned long )0, 0.0 ); + + + for ( p = p_begin; p <= p_end; p += p_inc ) + { + + if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); + else m = ( dim_t ) m_input; + if ( n_input < 0 ) n = p / ( dim_t )abs(n_input); + else n = ( dim_t ) n_input; + + bli_obj_create( dt, 1, 1, 0, 0, &alpha ); + + if ( bli_does_trans( side ) ) + bli_obj_create( dt, m, m, 0, 0, &a ); + else + bli_obj_create( dt, n, n, 0, 0, &a ); + bli_obj_create( dt, m, n, 0, 0, &c ); + bli_obj_create( dt, m, n, 0, 0, &c_save ); + + bli_randm( &a ); + bli_randm( &c ); + + bli_obj_set_struc( BLIS_TRIANGULAR, &a ); + bli_obj_set_uplo( uploa, &a ); + bli_obj_set_conjtrans( transa, &a ); + bli_obj_set_diag( diaga, &a ); + + bli_randm( &a ); + bli_mktrim( &a ); + + bli_setsc( (2.0/1.0), 0.0, &alpha ); + + bli_copym( &c, &c_save ); + +#ifdef BLIS + bli_ind_disable_all_dt( dt ); + bli_ind_enable_dt( ind, dt ); +#endif + + dtime_save = DBL_MAX; + + for ( r = 0; r < n_repeats; ++r ) + { + bli_copym( &c_save, &c ); + + + dtime = bli_clock(); + + +#ifdef PRINT + bli_printm( "a", &a, "%4.1f", "" ); + bli_printm( "c", &c, "%4.1f", "" ); +#endif + +#ifdef BLIS + + bli_trmm( side, + &alpha, + &a, + &c ); + +#else + + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = bli_obj_buffer( &alpha ); + float* ap = bli_obj_buffer( &a ); + float* cp = bli_obj_buffer( &c ); + + strmm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length( 
&c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = bli_obj_buffer( &alpha ); + double* ap = bli_obj_buffer( &a ); + double* cp = bli_obj_buffer( &c ); + + dtrmm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + scomplex* alphap = bli_obj_buffer( &alpha ); + scomplex* ap = bli_obj_buffer( &a ); + scomplex* cp = bli_obj_buffer( &c ); + + ctrmm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + dcomplex* alphap = bli_obj_buffer( &alpha ); + dcomplex* ap = bli_obj_buffer( &a ); + dcomplex* cp = bli_obj_buffer( &c ); + + ztrmm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } +#endif + +#ifdef PRINT + bli_printm( "c after", &c, "%4.1f", "" ); + exit(1); +#endif + + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + if ( bli_is_left( side ) ) + gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 ); + else + gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 ); + + if ( bli_is_complex( dt ) ) gflops *= 4.0; + +#ifdef BLIS + printf( "data_%s_%ctrmm_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )m, + ( unsigned long )n, gflops ); + + bli_obj_free( &alpha ); + + bli_obj_free( &a ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); + 
} + + //bli_finalize(); + + return 0; +} + diff --git a/test/3m4m/test_trsm.c b/test/3m4m/test_trsm.c new file mode 100644 index 000000000..f417a5361 --- /dev/null +++ b/test/3m4m/test_trsm.c @@ -0,0 +1,338 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include +#include "blis.h" + + +//#define PRINT + +int main( int argc, char** argv ) +{ + obj_t a, c, d; + obj_t c_save; + obj_t alpha; + dim_t m, n; + dim_t p; + dim_t p_begin, p_end, p_inc; + int m_input, n_input; + ind_t ind; + num_t dt; + char dt_ch; + int r, n_repeats; + side_t side; + uplo_t uploa; + trans_t transa; + diag_t diaga; + f77_char f77_side; + f77_char f77_uploa; + f77_char f77_transa; + f77_char f77_diaga; + + double dtime; + double dtime_save; + double gflops; + + //bli_init(); + + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); + + n_repeats = 3; + + dt = DT; + + ind = IND; + + p_begin = P_BEGIN; + p_end = P_END; + p_inc = P_INC; + + m_input = -1; + n_input = -1; + + + // Supress compiler warnings about unused variable 'ind'. + ( void )ind; + +#if 0 + + cntx_t* cntx; + + ind_t ind_mod = ind; + + // A hack to use 3m1 as 1mpb (with 1m as 1mbp). + if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; + + // Initialize a context for the current induced method and datatype. + cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + + // Set k to the kc blocksize for the current datatype. + k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + +#elif 1 + + //k_input = 256; + +#endif + + // Choose the char corresponding to the requested datatype. + if ( bli_is_float( dt ) ) dt_ch = 's'; + else if ( bli_is_double( dt ) ) dt_ch = 'd'; + else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; + else dt_ch = 'z'; + +#if 0 + side = BLIS_LEFT; +#else + side = BLIS_RIGHT; +#endif +#if 0 + uploa = BLIS_LOWER; +#else + uploa = BLIS_UPPER; +#endif + transa = BLIS_NO_TRANSPOSE; + diaga = BLIS_NONUNIT_DIAG; + + bli_param_map_blis_to_netlib_side( side, &f77_side ); + bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa ); + bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); + bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga ); + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front. 
+ for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; +#ifdef BLIS + printf( "data_%s_%ctrsm_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )0, + ( unsigned long )0, 0.0 ); + + + for ( p = p_begin; p <= p_end; p += p_inc ) + { + + if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); + else m = ( dim_t ) m_input; + if ( n_input < 0 ) n = p / ( dim_t )abs(n_input); + else n = ( dim_t ) n_input; + + bli_obj_create( dt, 1, 1, 0, 0, &alpha ); + + if ( bli_does_trans( side ) ) + bli_obj_create( dt, m, m, 0, 0, &a ); + else + bli_obj_create( dt, n, n, 0, 0, &a ); + bli_obj_create( dt, m, n, 0, 0, &c ); + //bli_obj_create( dt, m, n, n, 1, &c ); + bli_obj_create( dt, m, n, 0, 0, &c_save ); + + if ( bli_does_trans( side ) ) + bli_obj_create( dt, m, m, 0, 0, &d ); + else + bli_obj_create( dt, n, n, 0, 0, &d ); + + bli_randm( &a ); + bli_randm( &c ); + + bli_obj_set_struc( BLIS_TRIANGULAR, &a ); + bli_obj_set_uplo( uploa, &a ); + bli_obj_set_conjtrans( transa, &a ); + bli_obj_set_diag( diaga, &a ); + + bli_randm( &a ); + bli_mktrim( &a ); + + bli_setd( &BLIS_TWO, &d ); + bli_addd( &d, &a ); + + bli_setsc( (2.0/1.0), 0.0, &alpha ); + + bli_copym( &c, &c_save ); + +#ifdef BLIS + bli_ind_disable_all_dt( dt ); + bli_ind_enable_dt( ind, dt ); +#endif + + dtime_save = DBL_MAX; + + for ( r = 0; r < n_repeats; ++r ) + { + bli_copym( &c_save, &c ); + + + dtime = bli_clock(); + + +#ifdef PRINT + bli_printm( "a", &a, "%4.1f", "" ); + bli_printm( "c", &c, "%4.1f", "" ); +#endif + +#ifdef BLIS + + bli_trsm( side, + &alpha, + &a, + &c ); + +#else + + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = bli_obj_buffer( &alpha ); + float* ap = bli_obj_buffer( &a ); + float* 
cp = bli_obj_buffer( &c ); + + strsm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = bli_obj_buffer( &alpha ); + double* ap = bli_obj_buffer( &a ); + double* cp = bli_obj_buffer( &c ); + + dtrsm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + scomplex* alphap = bli_obj_buffer( &alpha ); + scomplex* ap = bli_obj_buffer( &a ); + scomplex* cp = bli_obj_buffer( &c ); + + ctrsm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + dcomplex* alphap = bli_obj_buffer( &alpha ); + dcomplex* ap = bli_obj_buffer( &a ); + dcomplex* cp = bli_obj_buffer( &c ); + + ztrsm_( &f77_side, + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); + } +#endif + +#ifdef PRINT + bli_printm( "c after", &c, "%4.1f", "" ); + exit(1); +#endif + + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + if ( bli_is_left( side ) ) + gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 ); + else + gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 ); + + if ( bli_is_complex( dt ) ) gflops *= 4.0; + +#ifdef BLIS + printf( "data_%s_%ctrsm_%s_blis", THR_STR, dt_ch, STR ); +#else + printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR ); +#endif + printf( "( %2lu, 1:4 ) = [ %4lu %4lu 
%7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )m, + ( unsigned long )n, gflops ); + + bli_obj_free( &alpha ); + + bli_obj_free( &a ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); + bli_obj_free( &d ); + } + + //bli_finalize(); + + return 0; +} + diff --git a/test/thread_ranges/test_ranges.c b/test/thread_ranges/test_ranges.c index 68ffe7fec..9bf293ca5 100644 --- a/test/thread_ranges/test_ranges.c +++ b/test/thread_ranges/test_ranges.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -290,13 +291,13 @@ int main( int argc, char** argv ) thrinfo.work_id = t; if ( part_n_dim && go_fwd ) - area = bli_thread_get_range_weighted_l2r( &thrinfo, &a, &bfs, &start, &end ); + area = bli_thread_range_weighted_l2r( &thrinfo, &a, &bfs, &start, &end ); else if ( part_n_dim && go_bwd ) - area = bli_thread_get_range_weighted_r2l( &thrinfo, &a, &bfs, &start, &end ); + area = bli_thread_range_weighted_r2l( &thrinfo, &a, &bfs, &start, &end ); else if ( part_m_dim && go_fwd ) - area = bli_thread_get_range_weighted_t2b( &thrinfo, &a, &bfs, &start, &end ); + area = bli_thread_range_weighted_t2b( &thrinfo, &a, &bfs, &start, &end ); else // ( part_m_dim && go_bwd ) - area = bli_thread_get_range_weighted_b2t( &thrinfo, &a, &bfs, &start, &end ); + area = bli_thread_range_weighted_b2t( &thrinfo, &a, &bfs, &start, &end ); width = end - start; diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 230b65820..59911d4ed 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -752,19 +752,73 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) // If bli_info_get_int_type_size() returns 32 or 64, the size is forced. // Otherwise, the size is chosen automatically. 
We query the result of // that automatic choice via sizeof(gint_t). -/* - if ( bli_info_get_int_type_size() == 32 || - bli_info_get_int_type_size() == 64 ) - sprintf( int_type_size_str, "%d", ( int )bli_info_get_int_type_size() ); - else - sprintf( int_type_size_str, "%d", ( int )sizeof(gint_t) * 8 ); -*/ if ( bli_info_get_int_type_size() == 32 || bli_info_get_int_type_size() == 64 ) int_type_size = bli_info_get_int_type_size(); else int_type_size = sizeof(gint_t) * 8; + char impl_str[16]; + char jrir_str[16]; + + // Describe the threading implementation. + if ( bli_info_get_enable_openmp() ) sprintf( impl_str, "openmp" ); + else if ( bli_info_get_enable_pthreads() ) sprintf( impl_str, "pthreads" ); + else /* threading disabled */ sprintf( impl_str, "disabled" ); + + // Describe the status of jrir thread partitioning. + if ( bli_info_get_thread_part_jrir_slab() ) sprintf( jrir_str, "slab" ); + else /*bli_info_get_thread_part_jrir_rr()*/ sprintf( jrir_str, "round-robin" ); + + char nt_str[16]; + char jc_nt_str[16]; + char pc_nt_str[16]; + char ic_nt_str[16]; + char jr_nt_str[16]; + char ir_nt_str[16]; + + // Query the number of ways of parallelism per loop (and overall) and + // convert these values into strings, with "unset" being used if the + // value returned was -1 (indicating the environment variable was unset). 
+ dim_t nt = bli_thread_get_num_threads(); + dim_t jc_nt = bli_thread_get_jc_nt(); + dim_t pc_nt = bli_thread_get_pc_nt(); + dim_t ic_nt = bli_thread_get_ic_nt(); + dim_t jr_nt = bli_thread_get_jr_nt(); + dim_t ir_nt = bli_thread_get_ir_nt(); + + if ( nt == -1 ) sprintf( nt_str, "unset" ); + else sprintf( nt_str, "%d", ( int ) nt ); + if ( jc_nt == -1 ) sprintf( jc_nt_str, "unset" ); + else sprintf( jc_nt_str, "%d", ( int )jc_nt ); + if ( pc_nt == -1 ) sprintf( pc_nt_str, "unset" ); + else sprintf( pc_nt_str, "%d", ( int )pc_nt ); + if ( ic_nt == -1 ) sprintf( ic_nt_str, "unset" ); + else sprintf( ic_nt_str, "%d", ( int )ic_nt ); + if ( jr_nt == -1 ) sprintf( jr_nt_str, "unset" ); + else sprintf( jr_nt_str, "%d", ( int )jr_nt ); + if ( ir_nt == -1 ) sprintf( ir_nt_str, "unset" ); + else sprintf( ir_nt_str, "%d", ( int )ir_nt ); + + // Set up rntm_t objects for each of the four families: + // gemm, herk, trmm, trsm. + rntm_t gemm, herk, trmm_l, trmm_r, trsm_l, trsm_r; + dim_t m = 1000, n = 1000, k = 1000; + + bli_thread_init_rntm( &gemm ); + bli_thread_init_rntm( &herk ); + bli_thread_init_rntm( &trmm_l ); + bli_thread_init_rntm( &trmm_r ); + bli_thread_init_rntm( &trsm_l ); + bli_thread_init_rntm( &trsm_r ); + + bli_rntm_set_ways_for_op( BLIS_GEMM, BLIS_LEFT, m, n, k, &gemm ); + bli_rntm_set_ways_for_op( BLIS_HERK, BLIS_LEFT, m, n, k, &herk ); + bli_rntm_set_ways_for_op( BLIS_TRMM, BLIS_LEFT, m, n, k, &trmm_l ); + bli_rntm_set_ways_for_op( BLIS_TRMM, BLIS_RIGHT, m, n, k, &trmm_r ); + bli_rntm_set_ways_for_op( BLIS_TRSM, BLIS_LEFT, m, n, k, &trsm_l ); + bli_rntm_set_ways_for_op( BLIS_TRSM, BLIS_RIGHT, m, n, k, &trsm_r ); + // Output some system parameters. 
libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "--- BLIS library info -------------------------------------\n" ); @@ -799,12 +853,62 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "CBLAS compatibility layer \n" ); libblis_test_fprintf_c( os, " enabled? %d\n", ( int )bli_info_get_enable_cblas() ); libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "libmemkind \n" ); + libblis_test_fprintf_c( os, " enabled? %d\n", ( int )bli_info_get_enable_memkind() ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "gemm sandbox \n" ); + libblis_test_fprintf_c( os, " enabled? %d\n", ( int )bli_info_get_enable_sandbox() ); + libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "floating-point types s d c z \n" ); libblis_test_fprintf_c( os, " sizes (bytes) %7u %7u %7u %7u\n", sizeof(float), sizeof(double), sizeof(scomplex), sizeof(dcomplex) ); libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "--- BLIS parallelization info ---\n" ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "multithreading %s\n", impl_str ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "thread auto-factorization \n" ); + libblis_test_fprintf_c( os, " m dim thread ratio %d\n", ( int )BLIS_THREAD_RATIO_M ); + libblis_test_fprintf_c( os, " n dim thread ratio %d\n", ( int )BLIS_THREAD_RATIO_N ); + libblis_test_fprintf_c( os, " jr max threads %d\n", ( int )BLIS_THREAD_MAX_JR ); + libblis_test_fprintf_c( os, " ir max threads %d\n", ( int )BLIS_THREAD_MAX_IR ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "ways of parallelism nt jc pc ic jr ir\n" ); + libblis_test_fprintf_c( os, " environment %5s %5s %5s %5s %5s %5s\n", + nt_str, jc_nt_str, pc_nt_str, + ic_nt_str, jr_nt_str, ir_nt_str ); + libblis_test_fprintf_c( os, " gemm (m,n,k=1000) %5d %5d %5d %5d %5d\n", + ( int 
)bli_rntm_jc_ways( &gemm ), ( int )bli_rntm_pc_ways( &gemm ), + ( int )bli_rntm_ic_ways( &gemm ), + ( int )bli_rntm_jr_ways( &gemm ), ( int )bli_rntm_ir_ways( &gemm ) ); + libblis_test_fprintf_c( os, " herk (m,k=1000) %5d %5d %5d %5d %5d\n", + ( int )bli_rntm_jc_ways( &herk ), ( int )bli_rntm_pc_ways( &herk ), + ( int )bli_rntm_ic_ways( &herk ), + ( int )bli_rntm_jr_ways( &herk ), ( int )bli_rntm_ir_ways( &herk ) ); + libblis_test_fprintf_c( os, " trmm_l (m,n=1000) %5d %5d %5d %5d %5d\n", + ( int )bli_rntm_jc_ways( &trmm_l ), ( int )bli_rntm_pc_ways( &trmm_l ), + ( int )bli_rntm_ic_ways( &trmm_l ), + ( int )bli_rntm_jr_ways( &trmm_l ), ( int )bli_rntm_ir_ways( &trmm_l ) ); + libblis_test_fprintf_c( os, " trmm_r (m,n=1000) %5d %5d %5d %5d %5d\n", + ( int )bli_rntm_jc_ways( &trmm_r ), ( int )bli_rntm_pc_ways( &trmm_r ), + ( int )bli_rntm_ic_ways( &trmm_r ), + ( int )bli_rntm_jr_ways( &trmm_r ), ( int )bli_rntm_ir_ways( &trmm_r ) ); + libblis_test_fprintf_c( os, " trsm_l (m,n=1000) %5d %5d %5d %5d %5d\n", + ( int )bli_rntm_jc_ways( &trsm_l ), ( int )bli_rntm_pc_ways( &trsm_l ), + ( int )bli_rntm_ic_ways( &trsm_l ), + ( int )bli_rntm_jr_ways( &trsm_l ), ( int )bli_rntm_ir_ways( &trsm_l ) ); + libblis_test_fprintf_c( os, " trsm_r (m,n=1000) %5d %5d %5d %5d %5d\n", + ( int )bli_rntm_jc_ways( &trsm_r ), ( int )bli_rntm_pc_ways( &trsm_r ), + ( int )bli_rntm_ic_ways( &trsm_r ), + ( int )bli_rntm_jr_ways( &trsm_r ), ( int )bli_rntm_ir_ways( &trsm_r ) ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "thread partitioning \n" ); + //libblis_test_fprintf_c( os, " jc/ic loops %s\n", "slab" ); + libblis_test_fprintf_c( os, " jr/ir loops %s\n", jrir_str ); + libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "--- BLIS default implementations ---\n" ); diff --git a/windows/build/libblis-symbols.def b/windows/build/libblis-symbols.def index 983292b05..13ae1c60c 100644 --- a/windows/build/libblis-symbols.def +++ 
b/windows/build/libblis-symbols.def @@ -1797,19 +1797,19 @@ bli_thread_get_jc_nt bli_thread_get_jr_nt bli_thread_get_num_threads bli_thread_get_pc_nt -bli_thread_get_range_b2t -bli_thread_get_range_l2r -bli_thread_get_range_mdim -bli_thread_get_range_ndim -bli_thread_get_range_r2l -bli_thread_get_range_sub -bli_thread_get_range_t2b -bli_thread_get_range_weighted_b2t -bli_thread_get_range_weighted_l2r -bli_thread_get_range_weighted_r2l -bli_thread_get_range_weighted_sub -bli_thread_get_range_weighted_t2b -bli_thread_get_range_width_l +bli_thread_range_b2t +bli_thread_range_l2r +bli_thread_range_mdim +bli_thread_range_ndim +bli_thread_range_r2l +bli_thread_range_sub +bli_thread_range_t2b +bli_thread_range_weighted_b2t +bli_thread_range_weighted_l2r +bli_thread_range_weighted_r2l +bli_thread_range_weighted_sub +bli_thread_range_weighted_t2b +bli_thread_range_width_l bli_thread_init bli_thread_init_rntm bli_thread_init_rntm_from_env