From f63f78d783accc5a86b3e6469e647698518e4e2c Mon Sep 17 00:00:00 2001 From: Dipal M Zambare Date: Mon, 20 Dec 2021 09:43:13 +0530 Subject: [PATCH] Removed Arch specific code from BLIS framework. - Removed BLIS_CONFIG_EPYC macro - The code dependent on this macro is handled in one of the three ways -- It is updated to work across platforms. -- Added in architecture/feature specific runtime checks. -- Duplicated in AMD specific files. Build system is updated to pick AMD specific files when library is built for any of the zen architecture AMD-Internal: [CPUPL-1960] Change-Id: I6f9f8018e41fa48eb43ae4245c9c2c361857f43b --- Makefile | 24 +- build/config.mk.in | 4 +- config/amdzen/make_defs.mk | 12 +- config/zen/make_defs.mk | 19 +- config/zen2/make_defs.mk | 16 +- config/zen3/make_defs.mk | 16 +- config/zen4/make_defs.mk | 16 +- configure | 3 +- frame/2/gemv/bli_gemv_unf_var1.c | 356 +----- frame/2/gemv/bli_gemv_unf_var1_amd.c | 440 ++++++++ frame/2/gemv/bli_gemv_unf_var2.c | 764 +------------ frame/2/gemv/bli_gemv_unf_var2_amd.c | 879 +++++++++++++++ frame/2/hemv/bli_hemv_unf_var1.c | 204 +--- frame/2/hemv/bli_hemv_unf_var1_amd.c | 418 +++++++ frame/2/hemv/bli_hemv_unf_var3.c | 208 +--- frame/2/hemv/bli_hemv_unf_var3_amd.c | 420 +++++++ frame/2/her2/bli_her2_unf_var1.c | 212 ---- frame/2/her2/bli_her2_unf_var1_amd.c | 369 ++++++ frame/2/her2/bli_her2_unf_var4.c | 187 ---- frame/2/her2/bli_her2_unf_var4_amd.c | 354 ++++++ frame/2/trsv/bli_trsv_unf_var1.c | 419 +------ frame/2/trsv/bli_trsv_unf_var1_amd.c | 638 +++++++++++ frame/2/trsv/bli_trsv_unf_var2.c | 804 +------------- frame/2/trsv/bli_trsv_unf_var2_amd.c | 1024 +++++++++++++++++ frame/3/bli_l3_sup_int.c | 128 +-- frame/3/bli_l3_sup_int_amd.c | 352 ++++++ frame/3/gemm/bli_gemm_front.c | 13 - frame/3/gemm/bli_gemm_front_amd.c | 413 +++++++ frame/base/bli_cpuid.c | 19 + frame/base/bli_cpuid.h | 4 +- frame/compat/bla_amax.c | 214 +--- frame/compat/bla_amax_amd.c | 295 +++++ frame/compat/bla_axpy.c | 407 +------ frame/compat/bla_axpy_amd.c | 462 ++++++++ frame/compat/bla_copy.c | 214 +--- frame/compat/bla_copy_amd.c | 285 +++++ frame/compat/bla_dot.c | 678 +---------- frame/compat/bla_dot_amd.c | 841 ++++++++++++++ frame/compat/bla_gemm.c | 507 +-------- frame/compat/bla_gemm_amd.c | 894 +++++++++++++++ frame/compat/bla_gemv.c | 853 +------------- frame/compat/bla_gemv_amd.c | 963 ++++++++++++++++ frame/compat/bla_scal.c | 176 +-- frame/compat/bla_scal_amd.c | 260 +++++ frame/compat/bla_swap.c | 195 +--- frame/compat/bla_swap_amd.c | 268 +++++ frame/compat/bla_trsm.c | 1172 +------------------ frame/compat/bla_trsm_amd.c | 1544 ++++++++++++++++++++++++++ kernels/zen/1/bli_scalv_zen_int10.c | 28 +- kernels/zen/1f/bli_axpyf_zen_int_4.c | 49 +- kernels/zen/1f/bli_axpyf_zen_int_5.c | 173 +-- kernels/zen/1f/bli_axpyf_zen_int_6.c | 26 +- kernels/zen/3/bli_gemm_small.c | 15 +- 53 files changed, 11226 insertions(+), 8028 deletions(-) create mode 100644 frame/2/gemv/bli_gemv_unf_var1_amd.c create mode 100644 frame/2/gemv/bli_gemv_unf_var2_amd.c create mode 100644 frame/2/hemv/bli_hemv_unf_var1_amd.c create mode 100644 frame/2/hemv/bli_hemv_unf_var3_amd.c create mode 100644 frame/2/her2/bli_her2_unf_var1_amd.c create mode 100644 frame/2/her2/bli_her2_unf_var4_amd.c create mode 100644 frame/2/trsv/bli_trsv_unf_var1_amd.c create mode 100644 frame/2/trsv/bli_trsv_unf_var2_amd.c create mode 100644 frame/3/bli_l3_sup_int_amd.c create mode 100644 frame/3/gemm/bli_gemm_front_amd.c create mode 100644 frame/compat/bla_amax_amd.c create mode 100644 
frame/compat/bla_axpy_amd.c create mode 100644 frame/compat/bla_copy_amd.c create mode 100644 frame/compat/bla_dot_amd.c create mode 100644 frame/compat/bla_gemm_amd.c create mode 100644 frame/compat/bla_gemv_amd.c create mode 100644 frame/compat/bla_scal_amd.c create mode 100644 frame/compat/bla_swap_amd.c create mode 100644 frame/compat/bla_trsm_amd.c diff --git a/Makefile b/Makefile index b248d5781..1658e16de 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -212,6 +212,27 @@ MK_REFKERN_OBJS := $(foreach arch, $(CONFIG_LIST), \ # Generate object file paths for all of the portable framework source code. MK_FRAME_OBJS := $(call gen-obj-paths-from-src,$(FRAME_SRC_SUFS),$(MK_FRAME_SRC),$(FRAME_PATH),$(BASE_OBJ_FRAME_PATH)) +# AMD has optimized some of the framework files, these optimizations +# may not be compatible with other platforms. +# +# In order to keep main framework code independent of AMD changes, +# AMD has duplicated the files and updated them for example +# frame/compact/bla_gemm.c : generic framework file +# frame/compact/bla_gemm_amd.c : AMD optimized framework file +# Based on the archiecture we choose correct files + +ifeq ($(MK_IS_ARCH_ZEN),yes) +# Build is being done for AMD platforms, remove the objects which +# don't have amd suffix (for which exists AMD specific implementation). +MK_FRAME_AMD_OBJS := $(filter $(BASE_OBJ_FRAME_PATH)/%amd.o, $(MK_FRAME_OBJS)) +FILES_TO_REMOVE := $(subst _amd.o,.o, $(MK_FRAME_AMD_OBJS)) +MK_FRAME_OBJS := $(filter-out $(FILES_TO_REMOVE), $(MK_FRAME_OBJS)) +else +# Build is done for non AMD platforms, remove the amd specific objects +MK_FRAME_AMD_OBJS := $(filter $(BASE_OBJ_FRAME_PATH)/%amd.o, $(MK_FRAME_OBJS)) +MK_FRAME_OBJS := $(filter-out $(MK_FRAME_AMD_OBJS), $(MK_FRAME_OBJS)) +endif + # Generate object file paths for all of the debgu and trace logger. MK_AOCLDTL_OBJS := $(call gen-obj-paths-from-src,$(AOCLDTL_SRC_SUFS),$(MK_AOCLDTL_SRC),$(AOCLDTL_PATH),$(BASE_OBJ_AOCLDTL_PATH)) @@ -1338,4 +1359,3 @@ else @echo "Uninstalling $(@F) from $(@D)/" @- $(RM_F) $@ endif - diff --git a/build/config.mk.in b/build/config.mk.in index 709e0f543..a880074e8 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -204,5 +204,7 @@ MK_ENABLE_AOCL_DYNAMIC := @enable_aocl_dynamic@ # BLAS int size MK_BLAS_INT_TYPE_SIZE := @blas_int_type_size@ +MK_IS_ARCH_ZEN := @enable_aocl_zen@ + # end of ifndef CONFIG_MK_INCLUDED conditional block endif diff --git a/config/amdzen/make_defs.mk b/config/amdzen/make_defs.mk index 7697e9ff0..e46746160 100644 --- a/config/amdzen/make_defs.mk +++ b/config/amdzen/make_defs.mk @@ -4,7 +4,7 @@ # An object-based framework for developing high-performance BLAS-like # libraries. # -# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved. 
+# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -49,16 +49,6 @@ else COPTFLAGS := -O3 endif -# This will add BLIS_CONFIG_EPYC for all framework files -# FIXME: framework files should not have architecture specific -# checks at least at compile time. Once the macro -# is defined it is applicable to every build in the -# Family including any non AMD configuration. -# However, it is still better to define it in makefiles -# instead of headers so we can have slighly more -# control on this. -COPTFLAGS += -DBLIS_CONFIG_EPYC - # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) diff --git a/config/zen/make_defs.mk b/config/zen/make_defs.mk index be1086a1d..08d8628be 100644 --- a/config/zen/make_defs.mk +++ b/config/zen/make_defs.mk @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -46,25 +46,12 @@ AMD_CONFIG_FILE := amd_config.mk AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen -include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE) - -# Since we removed BLIS_CONFIG_EPYC from header file, we need to -# add it here at two places, -# CPPROCFLAGS = This will enable it for framework code -# This flag is used when configure is invoked with specific architecture -# CKOPTFLAGS = This will enable it for architecture specific kernels -# This flag is used for kernels assocaited with this architecture -# irrespective of the configuration it is built for. - -CPPROCFLAGS := -DBLIS_CONFIG_EPYC - - ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O3 endif - # # --- Enable ETRACE across the library if enabled ETRACE_ENABLE=[0,1] ----------------------- # @@ -86,10 +73,6 @@ else CRVECFLAGS := $(CKVECFLAGS) endif -# Add this after updating variables for reference kernels -# we don't want this defined for them -CKOPTFLAGS += -DBLIS_CONFIG_EPYC - # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) diff --git a/config/zen2/make_defs.mk b/config/zen2/make_defs.mk index ba91f722a..3b87d35b0 100644 --- a/config/zen2/make_defs.mk +++ b/config/zen2/make_defs.mk @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -50,15 +50,7 @@ THIS_CONFIG := zen2 # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. 
-# Since we removed BLIS_CONFIG_EPYC from header file, we need to -# add it here at two places, -# CPPROCFLAGS = This will enable it for framework code -# This flag is used when configure is invoked with specific architecture -# CKOPTFLAGS = This will enable it for architecture specific kernels -# This flag is used for kernels assocaited with this architecture -# irrespective of the configuration it is built for. - -CPPROCFLAGS := -DBLIS_CONFIG_EPYC +CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := @@ -111,10 +103,6 @@ endif CROPTFLAGS := $(CKOPTFLAGS) CRVECFLAGS := $(CKVECFLAGS) -# Add this after updating variables for reference kernels -# we don't want this defined for them -CKOPTFLAGS += -DBLIS_CONFIG_EPYC - # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) diff --git a/config/zen3/make_defs.mk b/config/zen3/make_defs.mk index a479acf8a..8522a1e95 100644 --- a/config/zen3/make_defs.mk +++ b/config/zen3/make_defs.mk @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -50,15 +50,7 @@ THIS_CONFIG := zen3 # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. -# Since we removed BLIS_CONFIG_EPYC from header file, we need to -# add it here at two places, -# CPPROCFLAGS = This will enable it for framework code -# This flag is used when configure is invoked with specific architecture -# CKOPTFLAGS = This will enable it for architecture specific kernels -# This flag is used for kernels assocaited with this architecture -# irrespective of the configuration it is built for. - -CPPROCFLAGS := -DBLIS_CONFIG_EPYC +CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := @@ -132,10 +124,6 @@ endif # gcc CROPTFLAGS := $(CKOPTFLAGS) CRVECFLAGS := $(CKVECFLAGS) -# Add this after updating variables for reference kernels -# we don't want this defined for them -CKOPTFLAGS += -DBLIS_CONFIG_EPYC - # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) diff --git a/config/zen4/make_defs.mk b/config/zen4/make_defs.mk index 352bd29c4..44e96bb0c 100644 --- a/config/zen4/make_defs.mk +++ b/config/zen4/make_defs.mk @@ -4,7 +4,7 @@ # An object-based framework for developing high-performance BLAS-like # libraries. # -# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -49,15 +49,7 @@ THIS_CONFIG := zen4 # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. -# Since we removed BLIS_CONFIG_EPYC from header file, we need to -# add it here at two places, -# CPPROCFLAGS = This will enable it for framework code -# This flag is used when configure is invoked with specific architecture -# CKOPTFLAGS = This will enable it for architecture specific kernels -# This flag is used for kernels assocaited with this architecture -# irrespective of the configuration it is built for. 
- -CPPROCFLAGS := -DBLIS_CONFIG_EPYC +CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := @@ -131,10 +123,6 @@ endif # gcc CROPTFLAGS := $(CKOPTFLAGS) CRVECFLAGS := $(CKVECFLAGS) -# Add this after updating variables for reference kernels -# we don't want this defined for them -CKOPTFLAGS += -DBLIS_CONFIG_EPYC - # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) diff --git a/configure b/configure index bec498d3c..f49ea19e5 100755 --- a/configure +++ b/configure @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -3370,6 +3370,7 @@ main() | sed -e "s/@enable_aocl_dynamic@/${enable_aocl_dynamic}/g" \ | sed -e "s/@complex_return@/${complex_return}/g" \ | sed -e "s/@blas_int_type_size@/${blas_int_type_size}/g" \ + | sed -e "s/\@enable_aocl_zen\@/${enable_aocl_zen}/g" \ > "${config_mk_out_path}" diff --git a/frame/2/gemv/bli_gemv_unf_var1.c b/frame/2/gemv/bli_gemv_unf_var1.c index 838ea577b..8162613c1 100644 --- a/frame/2/gemv/bli_gemv_unf_var1.c +++ b/frame/2/gemv/bli_gemv_unf_var1.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020 - 21, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -104,357 +104,5 @@ void PASTEMAC(ch,varname) \ } \ } -#ifdef BLIS_CONFIG_EPYC -void bli_dgemv_unf_var1 - ( - trans_t transa, - conj_t conjx, - dim_t m, - dim_t n, - double* alpha, - double* a, inc_t rs_a, inc_t cs_a, - double* x, inc_t incx, - double* beta, - double* y, inc_t incy, - cntx_t* cntx - ) -{ - - double *A1; - double *y1; - dim_t i; - dim_t f; - dim_t n_elem, n_iter; - inc_t rs_at, cs_at; - conj_t conja; - //memory pool declarations for packing vector X. - mem_t mem_bufX; - rntm_t rntm; - double *x_buf = x; - inc_t buf_incx = incx; - - bli_init_once(); - - if (cntx == NULL) - cntx = bli_gks_query_cntx(); - - bli_set_dims_incs_with_trans(transa, - m, n, rs_a, cs_a, - &n_iter, &n_elem, &rs_at, &cs_at); - - conja = bli_extract_conj(transa); - - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen == 0) - { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - const num_t dt = PASTEMAC(d,type); - double* x1; - double* y1; - PASTECH(d,dotxf_ker_ft) kfp_df; - /* Query the context for the kernel function pointer and fusing factor. 
*/ - kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); - dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); - - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); - - A1 = a + (i )*rs_at + (0 )*cs_at; - x1 = x + (0 )*incy; - y1 = y + (i )*incy; - - /* y1 = beta * y1 + alpha * A1 * x; */ - kfp_df - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, cs_at, rs_at, - x1, incx, - beta, - y1, incy, - cntx - ); - - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); - return; - } - - if (incx > 1) - { - /* - Initialize mem pool buffer to NULL and size to 0 - "buf" and "size" fields are assigned once memory - is allocated from the pool in bli_membrk_acquire_m(). - This will ensure bli_mem_is_alloc() will be passed on - an allocated memory if created or a NULL . - */ - - mem_bufX.pblk.buf = NULL; - mem_bufX.pblk.block_size = 0; - mem_bufX.buf_type = 0; - mem_bufX.size = 0; - mem_bufX.pool = NULL; - - /* In order to get the buffer from pool via rntm access to memory broker - is needed.Following are initializations for rntm */ - - bli_rntm_init_from_global(&rntm); - bli_rntm_set_num_threads_only(1, &rntm); - bli_membrk_rntm_set_membrk(&rntm); - - //calculate the size required for n_elem double elements in vector X. - size_t buffer_size = n_elem * sizeof(double); - -#ifdef BLIS_ENABLE_MEM_TRACING - printf("bli_dgemv_unf_var1(): get mem pool block\n"); -#endif - - /*acquire a Buffer(n_elem*size(double)) from the memory broker - and save the associated mem_t entry to mem_bufX.*/ - bli_membrk_acquire_m(&rntm, - buffer_size, - BLIS_BUFFER_FOR_B_PANEL, - &mem_bufX); - - /*Continue packing X if buffer memory is allocated*/ - if ((bli_mem_is_alloc(&mem_bufX))) - { - x_buf = bli_mem_buffer(&mem_bufX); - - //pack X vector with non-unit stride to a temp buffer x_buf with unit stride - for (dim_t x_index = 0; x_index < n_elem; x_index++) - { - *(x_buf + x_index) = *(x + (x_index * incx)); - } - // stride of vector x_buf =1 - buf_incx = 1; - } - } - - dim_t fuse_factor = 8; - dim_t f_temp =0; - - if (n < 4) - { - fuse_factor = 2; - } else if (n < 8) - { - fuse_factor = 4; - } - - - for (i = 0; i < n_iter; i += f) - { - f = bli_determine_blocksize_dim_f(i, n_iter, fuse_factor); - - //A = a + i * row_increment + 0 * column_increment - A1 = a + (i)*rs_at; - y1 = y + (i)*incy; - - /* y1 = beta * y1 + alpha * A1 * x; */ - switch (f) - { - case 8: - - bli_ddotxf_zen_int_8( - conja, - conjx, - n_elem, - f, - alpha, - A1, cs_at, rs_at, - x_buf, buf_incx, - beta, - y1, incy, - cntx); - - break; - default: - - if (f < 4) - { - bli_ddotxf_zen_int_2( - conja, - conjx, - n_elem, - f, - alpha, - A1, cs_at, rs_at, - x_buf, buf_incx, - beta, - y1, incy, - cntx); - } - else - { - bli_ddotxf_zen_int_4( - conja, - conjx, - n_elem, - f, - alpha, - A1, cs_at, rs_at, - x_buf, buf_incx, - beta, - y1, incy, - cntx); - } - } - - f_temp = bli_determine_blocksize_dim_f(i + f, n_iter, fuse_factor); - - if (f_temp < fuse_factor) - { - switch (fuse_factor) - { - case 8: - fuse_factor = 4; - break; - case 4: - fuse_factor = 2; - break; - } - } - } - - if ((incx > 1) && bli_mem_is_alloc(&mem_bufX)) - { -#ifdef BLIS_ENABLE_MEM_TRACING - printf("bli_dgemv_unf_var1(): releasing mem pool block\n"); -#endif - // Return the buffer to pool - bli_membrk_release(&rntm, &mem_bufX); - } - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); -} - -void bli_sgemv_unf_var1 - ( - trans_t transa, - conj_t conjx, - dim_t m, - dim_t n, - float* alpha, - float* a, inc_t rs_a, inc_t cs_a, - float* x, inc_t 
incx, - float* beta, - float* y, inc_t incy, - cntx_t* cntx - ) -{ - - float* A1; - float* x1; - float* y1; - dim_t i; - dim_t b_fuse, f; - dim_t n_elem, n_iter; - inc_t rs_at, cs_at; - conj_t conja; - - bli_init_once(); - - if( cntx == NULL ) cntx = bli_gks_query_cntx(); - - bli_set_dims_incs_with_trans( transa, - m, n, rs_a, cs_a, - &n_iter, &n_elem, &rs_at, &cs_at ); - - conja = bli_extract_conj( transa ); - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen == 0) - { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - const num_t dt = PASTEMAC(s,type); - float* x1 ; - PASTECH(s,dotxf_ker_ft) kfp_df; - /* Query the context for the kernel function pointer and fusing factor. */ - kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); - b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); - - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); - - A1 = a + (i )*rs_at + (0 )*cs_at; - x1 = x + (0 )*incy; - y1 = y + (i )*incy; - - /* y1 = beta * y1 + alpha * A1 * x; */ - kfp_df - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, cs_at, rs_at, - x1, incx, - beta, - y1, incy, - cntx - ); - - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); - return; - } - - /* Query the context for the kernel function pointer and fusing factor. */ - b_fuse = 8; - - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); - - A1 = a + (i )*rs_at + (0 )*cs_at; - x1 = x + (0 )*incy; - y1 = y + (i )*incy; - - /* y1 = beta * y1 + alpha * A1 * x; */ - bli_sdotxf_zen_int_8 - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, cs_at, rs_at, - x1, incx, - beta, - y1, incy, - cntx - ); - - } -} - -INSERT_GENTFUNC_BASIC0_CZ( gemv_unf_var1 ) -#else INSERT_GENTFUNC_BASIC0( gemv_unf_var1 ) -#endif + diff --git a/frame/2/gemv/bli_gemv_unf_var1_amd.c b/frame/2/gemv/bli_gemv_unf_var1_amd.c new file mode 100644 index 000000000..7228c12f7 --- /dev/null +++ b/frame/2/gemv/bli_gemv_unf_var1_amd.c @@ -0,0 +1,440 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + trans_t transa, \ + conj_t conjx, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* x, inc_t incx, \ + ctype* beta, \ + ctype* y, inc_t incy, \ + cntx_t* cntx \ + ) \ +{ \ +\ + if(cntx == NULL) cntx = bli_gks_query_cntx(); \ +\ + const num_t dt = PASTEMAC(ch,type); \ +\ + ctype* A1; \ + ctype* x1; \ + ctype* y1; \ + dim_t i; \ + dim_t b_fuse, f; \ + dim_t n_elem, n_iter; \ + inc_t rs_at, cs_at; \ + conj_t conja; \ +\ + bli_set_dims_incs_with_trans( transa, \ + m, n, rs_a, cs_a, \ + &n_iter, &n_elem, &rs_at, &cs_at ); \ +\ + conja = bli_extract_conj( transa ); \ +\ + PASTECH(ch,dotxf_ker_ft) kfp_df; \ +\ + /* Query the context for the kernel function pointer and fusing factor. */ \ + kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ + b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \ +\ + for ( i = 0; i < n_iter; i += f ) \ + { \ + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \ +\ + A1 = a + (i )*rs_at + (0 )*cs_at; \ + x1 = x + (0 )*incy; \ + y1 = y + (i )*incy; \ +\ + /* y1 = beta * y1 + alpha * A1 * x; */ \ + kfp_df \ + ( \ + conja, \ + conjx, \ + n_elem, \ + f, \ + alpha, \ + A1, cs_at, rs_at, \ + x1, incx, \ + beta, \ + y1, incy, \ + cntx \ + ); \ +\ + } \ +} + +void bli_dgemv_unf_var1 + ( + trans_t transa, + conj_t conjx, + dim_t m, + dim_t n, + double* alpha, + double* a, inc_t rs_a, inc_t cs_a, + double* x, inc_t incx, + double* beta, + double* y, inc_t incy, + cntx_t* cntx + ) +{ + + double *A1; + double *y1; + dim_t i; + dim_t f; + dim_t n_elem, n_iter; + inc_t rs_at, cs_at; + conj_t conja; + //memory pool declarations for packing vector X. + mem_t mem_bufX; + rntm_t rntm; + double *x_buf = x; + inc_t buf_incx = incx; + + bli_init_once(); + + if (cntx == NULL) + cntx = bli_gks_query_cntx(); + + bli_set_dims_incs_with_trans(transa, + m, n, rs_a, cs_a, + &n_iter, &n_elem, &rs_at, &cs_at); + + conja = bli_extract_conj(transa); + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == FALSE) + { + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + const num_t dt = PASTEMAC(d,type); + double* x1; + double* y1; + PASTECH(d,dotxf_ker_ft) kfp_df; + /* Query the context for the kernel function pointer and fusing factor. 
*/ + kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); + dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); + + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + + A1 = a + (i )*rs_at + (0 )*cs_at; + x1 = x + (0 )*incy; + y1 = y + (i )*incy; + + /* y1 = beta * y1 + alpha * A1 * x; */ + kfp_df + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, cs_at, rs_at, + x1, incx, + beta, + y1, incy, + cntx + ); + + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); + return; + } + if (incx > 1) + { + /* + Initialize mem pool buffer to NULL and size to 0 + "buf" and "size" fields are assigned once memory + is allocated from the pool in bli_membrk_acquire_m(). + This will ensure bli_mem_is_alloc() will be passed on + an allocated memory if created or a NULL . + */ + + mem_bufX.pblk.buf = NULL; + mem_bufX.pblk.block_size = 0; + mem_bufX.buf_type = 0; + mem_bufX.size = 0; + mem_bufX.pool = NULL; + + /* In order to get the buffer from pool via rntm access to memory broker + is needed.Following are initializations for rntm */ + + bli_rntm_init_from_global(&rntm); + bli_rntm_set_num_threads_only(1, &rntm); + bli_membrk_rntm_set_membrk(&rntm); + + //calculate the size required for n_elem double elements in vector X. + size_t buffer_size = n_elem * sizeof(double); + +#ifdef BLIS_ENABLE_MEM_TRACING + printf("bli_dgemv_unf_var1(): get mem pool block\n"); +#endif + + /*acquire a Buffer(n_elem*size(double)) from the memory broker + and save the associated mem_t entry to mem_bufX.*/ + bli_membrk_acquire_m(&rntm, + buffer_size, + BLIS_BUFFER_FOR_B_PANEL, + &mem_bufX); + + /*Continue packing X if buffer memory is allocated*/ + if ((bli_mem_is_alloc(&mem_bufX))) + { + x_buf = bli_mem_buffer(&mem_bufX); + + //pack X vector with non-unit stride to a temp buffer x_buf with unit stride + for (dim_t x_index = 0; x_index < n_elem; x_index++) + { + *(x_buf + x_index) = *(x + (x_index * incx)); + } + // stride of vector x_buf =1 + buf_incx = 1; + } + } + + dim_t fuse_factor = 8; + dim_t f_temp =0; + + if (n < 4) + { + fuse_factor = 2; + } else if (n < 8) + { + fuse_factor = 4; + } + + for (i = 0; i < n_iter; i += f) + { + f = bli_determine_blocksize_dim_f(i, n_iter, fuse_factor); + + //A = a + i * row_increment + 0 * column_increment + A1 = a + (i)*rs_at; + y1 = y + (i)*incy; + + /* y1 = beta * y1 + alpha * A1 * x; */ + switch (f) + { + case 8: + + bli_ddotxf_zen_int_8( + conja, + conjx, + n_elem, + f, + alpha, + A1, cs_at, rs_at, + x_buf, buf_incx, + beta, + y1, incy, + cntx); + + break; + default: + + if (f < 4) + { + bli_ddotxf_zen_int_2( + conja, + conjx, + n_elem, + f, + alpha, + A1, cs_at, rs_at, + x_buf, buf_incx, + beta, + y1, incy, + cntx); + } + else + { + bli_ddotxf_zen_int_4( + conja, + conjx, + n_elem, + f, + alpha, + A1, cs_at, rs_at, + x_buf, buf_incx, + beta, + y1, incy, + cntx); + } + } + + f_temp = bli_determine_blocksize_dim_f(i + f, n_iter, fuse_factor); + + if (f_temp < fuse_factor) + { + switch (fuse_factor) + { + case 8: + fuse_factor = 4; + break; + case 4: + fuse_factor = 2; + break; + } + } + } + + if ((incx > 1) && bli_mem_is_alloc(&mem_bufX)) + { +#ifdef BLIS_ENABLE_MEM_TRACING + printf("bli_dgemv_unf_var1(): releasing mem pool block\n"); +#endif + // Return the buffer to pool + bli_membrk_release(&rntm, &mem_bufX); + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); +} + +void bli_sgemv_unf_var1 + ( + trans_t transa, + conj_t conjx, + dim_t m, + dim_t n, + float* alpha, + float* a, inc_t rs_a, inc_t cs_a, + float* x, inc_t 
incx, + float* beta, + float* y, inc_t incy, + cntx_t* cntx + ) +{ + + float* A1; + float* x1; + float* y1; + dim_t i; + dim_t b_fuse, f; + dim_t n_elem, n_iter; + inc_t rs_at, cs_at; + conj_t conja; + + bli_init_once(); + + if( cntx == NULL ) cntx = bli_gks_query_cntx(); + + bli_set_dims_incs_with_trans( transa, + m, n, rs_a, cs_a, + &n_iter, &n_elem, &rs_at, &cs_at ); + + conja = bli_extract_conj( transa ); + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == FALSE) + { + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + const num_t dt = PASTEMAC(s,type); + float* x1 ; + PASTECH(s,dotxf_ker_ft) kfp_df; + /* Query the context for the kernel function pointer and fusing factor. */ + kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); + b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); + + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + + A1 = a + (i )*rs_at + (0 )*cs_at; + x1 = x + (0 )*incy; + y1 = y + (i )*incy; + + /* y1 = beta * y1 + alpha * A1 * x; */ + kfp_df + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, cs_at, rs_at, + x1, incx, + beta, + y1, incy, + cntx + ); + + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); + return; + } + + /* Query the context for the kernel function pointer and fusing factor. */ + b_fuse = 8; + + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + + A1 = a + (i )*rs_at + (0 )*cs_at; + x1 = x + (0 )*incy; + y1 = y + (i )*incy; + + /* y1 = beta * y1 + alpha * A1 * x; */ + bli_sdotxf_zen_int_8 + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, cs_at, rs_at, + x1, incx, + beta, + y1, incy, + cntx + ); + + } +} + +INSERT_GENTFUNC_BASIC0_CZ( gemv_unf_var1 ) + diff --git a/frame/2/gemv/bli_gemv_unf_var2.c b/frame/2/gemv/bli_gemv_unf_var2.c index 093b615a7..227e43ad0 100644 --- a/frame/2/gemv/bli_gemv_unf_var2.c +++ b/frame/2/gemv/bli_gemv_unf_var2.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020-21, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -137,764 +137,4 @@ void PASTEMAC(ch,varname) \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); \ } -#ifdef BLIS_CONFIG_EPYC - -void bli_dgemv_unf_var2 - ( - trans_t transa, - conj_t conjx, - dim_t m, - dim_t n, - double* alpha, - double* a, inc_t rs_a, inc_t cs_a, - double* x, inc_t incx, - double* beta, - double* y, inc_t incy, - cntx_t* cntx - ) -{ - - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); - double* A1; - double* x1; - dim_t i; - dim_t f; - dim_t n_elem, n_iter; - inc_t rs_at, cs_at; - conj_t conja; - //memory pool declarations for packing vector Y. - mem_t mem_bufY; - rntm_t rntm; - double *y_buf = y; - inc_t buf_incy = incy; - - bli_set_dims_incs_with_trans( transa, - m, n, rs_a, cs_a, - &n_elem, &n_iter, &rs_at, &cs_at ); - - conja = bli_extract_conj( transa ); - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). 
- arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen == 0) - { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - const num_t dt = PASTEMAC(d,type); - double* x1; - double* y1; - /* If beta is zero, use setv. Otherwise, scale by beta. */ - if ( PASTEMAC(d,eq0)( *beta ) ) - { - double* zero = PASTEMAC(d,0); - /* y = 0; */ - PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n_elem, - zero, - y, incy, - cntx, - NULL - ); - } - else - { - /* y = beta * y; */ - PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y, incy, - cntx, - NULL - ); - } - - PASTECH(d,axpyf_ker_ft) kfp_af; - - /* Query the context for the kernel function pointer and fusing factor. */ - kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); - dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); - - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); - - A1 = a + (0 )*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - y1 = y + (0 )*incy; - - /* y = y + alpha * A1 * x1; */ - kfp_af - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, rs_at, cs_at, - x1, incx, - y1, incy, - cntx - ); - } - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); - return; - } - - /* If beta is zero, use setv. Otherwise, scale by beta. */ - /* y = beta * y; */ - /* beta=0 case is hadled by scalv internally */ - - bli_dscalv_zen_int10 - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y, incy, - NULL - ); - - if( bli_deq0( *alpha ) ) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3) - return; - } - - if (incy > 1) - { - /* - Initialize mem pool buffer to NULL and size to 0 - "buf" and "size" fields are assigned once memory - is allocated from the pool in bli_membrk_acquire_m(). - This will ensure bli_mem_is_alloc() will be passed on - an allocated memory if created or a NULL . - */ - mem_bufY.pblk.buf = NULL; mem_bufY.pblk.block_size = 0; - mem_bufY.buf_type = 0; mem_bufY.size = 0; - mem_bufY.pool = NULL; - - /* In order to get the buffer from pool via rntm access to memory broker - is needed.Following are initializations for rntm */ - - bli_rntm_init_from_global( &rntm ); - bli_rntm_set_num_threads_only( 1, &rntm ); - bli_membrk_rntm_set_membrk( &rntm ); - - //calculate the size required for n_elem double elements in vector Y. 
- size_t buffer_size = n_elem * sizeof(double); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dgemv_unf_var2(): get mem pool block\n" ); - #endif - - /*acquire a Buffer(n_elem*size(double)) from the memory broker - and save the associated mem_t entry to mem_bufY.*/ - bli_membrk_acquire_m(&rntm, - buffer_size, - BLIS_BUFFER_FOR_B_PANEL, - &mem_bufY); - - /*Continue packing Y if buffer memory is allocated*/ - if ((bli_mem_is_alloc( &mem_bufY ))) - { - y_buf = bli_mem_buffer(&mem_bufY); - - //pack Y vector with non-unit stride to a temp buffer y_buf with unit stride - for(dim_t y_index = 0 ; y_index < n_elem ; y_index++) - { - *(y_buf + y_index) = *(y + (y_index * incy)) ; - } - // stride of vector y_buf =1 - buf_incy = 1; - } - } - - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, BLIS_DGEMV_VAR2_FUSE ); - - A1 = a + (0 )*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - - /* y = y + alpha * A1 * x1; */ - bli_daxpyf_zen_int_16x4 - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, rs_at, cs_at, - x1, incx, - y_buf, buf_incy, - NULL - ); - } - if ((incy > 1) && bli_mem_is_alloc( &mem_bufY )) - { - //store the result from unit strided y_buf to non-unit strided Y - for(dim_t y_index = 0 ; y_index < n_elem ; y_index++) - { - *(y + (y_index * incy)) = *(y_buf + y_index) ; - } - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_dgemv_unf_var2(): releasing mem pool block\n" ); - #endif - // Return the buffer to pool - bli_membrk_release(&rntm , &mem_bufY); - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); -} - -void bli_sgemv_unf_var2 - ( - trans_t transa, - conj_t conjx, - dim_t m, - dim_t n, - float* alpha, - float* a, inc_t rs_a, inc_t cs_a, - float* x, inc_t incx, - float* beta, - float* y, inc_t incy, - cntx_t* cntx - ) -{ - - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); - float* A1; - float* x1; - float* y1; - dim_t i; - dim_t b_fuse, f; - dim_t n_elem, n_iter; - inc_t rs_at, cs_at; - conj_t conja; - - bli_set_dims_incs_with_trans( transa, - m, n, rs_a, cs_a, - &n_elem, &n_iter, &rs_at, &cs_at ); - - conja = bli_extract_conj( transa ); - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen == 0) - { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - const num_t dt = PASTEMAC(s,type); - /* If beta is zero, use setv. Otherwise, scale by beta. */ - if ( PASTEMAC(s,eq0)( *beta ) ) - { - float* zero = PASTEMAC(s,0); - /* y = 0; */ - PASTEMAC2(s,setv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n_elem, - zero, - y, incy, - cntx, - NULL - ); - } - else - { - /* y = beta * y; */ - PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y, incy, - cntx, - NULL - ); - } - - PASTECH(s,axpyf_ker_ft) kfp_af; - - /* Query the context for the kernel function pointer and fusing factor. 
*/ - kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); - b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); - - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); - - A1 = a + (0 )*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - y1 = y + (0 )*incy; - - /* y = y + alpha * A1 * x1; */ - kfp_af - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, rs_at, cs_at, - x1, incx, - y1, incy, - cntx - ); - } - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); - return; - } - - /* If beta is zero, use setv. Otherwise, scale by beta. */ - /* y = beta * y; */ - /* beta=0 case is hadled by scalv internally */ - - bli_sscalv_zen_int10 - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y, incy, - NULL - ); - - if( bli_seq0( *alpha ) ) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3) - return; - } - - /* Query the context for the kernel function pointer and fusing factor. */ - b_fuse = 6; - - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); - - A1 = a + (0 )*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - y1 = y + (0 )*incy; - - /* y = y + alpha * A1 * x1; */ - bli_saxpyf_zen_int_6 - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, rs_at, cs_at, - x1, incx, - y1, incy, - NULL - ); - } - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); -} - - -void bli_zgemv_unf_var2 - ( - trans_t transa, - conj_t conjx, - dim_t m, - dim_t n, - dcomplex* alpha, - dcomplex* a, inc_t rs_a, inc_t cs_a, - dcomplex* x, inc_t incx, - dcomplex* beta, - dcomplex* y, inc_t incy, - cntx_t* cntx - ) -{ - - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); - dcomplex* A1; - dcomplex* x1; - dcomplex* y1; - dim_t i; - dim_t b_fuse, f; - dim_t n_elem, n_iter; - inc_t rs_at, cs_at; - conj_t conja; - - bli_set_dims_incs_with_trans( transa, - m, n, rs_a, cs_a, - &n_elem, &n_iter, &rs_at, &cs_at ); - - conja = bli_extract_conj( transa ); - - /* If beta is zero, use setv. Otherwise, scale by beta. */ - /* y = beta * y; */ - - /* beta=0 case is hadled by scalv internally */ - /* bli_zscalv_zen_int10 - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y, - incy, - cntx - );*/ - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen == 0) - { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - const num_t dt = PASTEMAC(z,type); - /* If beta is zero, use setv. Otherwise, scale by beta. */ - if ( PASTEMAC(z,eq0)( *beta ) ) - { - dcomplex* zero = PASTEMAC(z,0); - /* y = 0; */ - PASTEMAC2(z,setv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n_elem, - zero, - y, incy, - cntx, - NULL - ); - } - else - { - /* y = beta * y; */ - PASTEMAC2(z,scalv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y, incy, - cntx, - NULL - ); - } - - PASTECH(z,axpyf_ker_ft) kfp_af; - - /* Query the context for the kernel function pointer and fusing factor. 
*/ - kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); - b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); - - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); - - A1 = a + (0 )*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - y1 = y + (0 )*incy; - - /* y = y + alpha * A1 * x1; */ - kfp_af - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, rs_at, cs_at, - x1, incx, - y1, incy, - cntx - ); - } - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); - return; - } - - bli_zscalv_ex - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y, incy, - cntx, - NULL - ); - - if( bli_zeq0( *alpha ) ) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); - return; - } - - // for non-unit incx, incy and rs_at and conjugate will be added in the next patch - if( (incx == 1 && incy == 1 && rs_at == 1 ) && - !bli_is_conj(conja) && !bli_is_conj(conjx) && !bli_is_trans(transa)) - { - // This gemv code deals with the followint conditions only - // 1. incx, incy, and row stride equal to one - // 2. Non conjugate A matrix and X vector - // 3. No Transpose for A Martix - // Rest is taken care by the else part (axpyf implementation) - bli_zgemv_zen_int_4x4 - ( - conja, - conjx, - m, - n, - alpha, - a, rs_at, cs_at, - x, incx, - beta, - y, incy, - NULL - ); - } - else - { - /* fusing factor */ - b_fuse = 4; - - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); - A1 = a + (0 )*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - y1 = y + (0 )*incy; - - /* y = y + alpha * A1 * x1; */ - bli_zaxpyf_zen_int_4 - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, rs_at, cs_at, - x1, incx, - y1, incy, - NULL - ); - } - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); -} - -void bli_cgemv_unf_var2 - ( - trans_t transa, - conj_t conjx, - dim_t m, - dim_t n, - scomplex* alpha, - scomplex* a, inc_t rs_a, inc_t cs_a, - scomplex* x, inc_t incx, - scomplex* beta, - scomplex* y, inc_t incy, - cntx_t* cntx - ) -{ - - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); - scomplex* A1; - scomplex* x1; - scomplex* y1; - dim_t i; - dim_t b_fuse, f; - dim_t n_elem, n_iter; - inc_t rs_at, cs_at; - conj_t conja; - - bli_set_dims_incs_with_trans( transa, - m, n, rs_a, cs_a, - &n_elem, &n_iter, &rs_at, &cs_at ); - - conja = bli_extract_conj( transa ); - - /* If beta is zero, use setv. Otherwise, scale by beta. */ - /* y = beta * y; */ - /* beta=0 case is hadled by scalv internally */ - /*bli_cscalv_zen_int10 - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y, - incy, - cntx - );*/ - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen == 0) - { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - const num_t dt = PASTEMAC(c,type); - /* If beta is zero, use setv. Otherwise, scale by beta. 
*/ - if ( PASTEMAC(c,eq0)( *beta ) ) - { - scomplex* zero = PASTEMAC(c,0); - /* y = 0; */ - PASTEMAC2(c,setv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n_elem, - zero, - y, incy, - cntx, - NULL - ); - } - else - { - /* y = beta * y; */ - PASTEMAC2(c,scalv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y, incy, - cntx, - NULL - ); - } - - PASTECH(c,axpyf_ker_ft) kfp_af; - - /* Query the context for the kernel function pointer and fusing factor. */ - kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); - b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); - - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); - - A1 = a + (0 )*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - y1 = y + (0 )*incy; - - /* y = y + alpha * A1 * x1; */ - kfp_af - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, rs_at, cs_at, - x1, incx, - y1, incy, - cntx - ); - } - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); - return; - } - - bli_cscalv_ex - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y, incy, - cntx, - NULL - ); - - - - if( bli_ceq0( *alpha ) ) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3) - return; - } - - // for non-unit incx, incy and rs_at and conjugate will be added in the next patch - if( ( (incx == 1) && (incy == 1) && (rs_at == 1) ) && - !bli_is_conj(conja) && !bli_is_conj(conjx) && - !bli_is_trans(transa)) - { - // This gemv code deals with the followint conditions only - // 1. incx, incy, and row stride equal to one - // 2. Non conjugate A matrix and X vector - // 3. No Transpose for A Martix - // Rest is taken care by the else part (axpyf implementation) - bli_cgemv_zen_int_4x4 - ( - conja, - conjx, - m, - n, - alpha, - a, rs_at, cs_at, - x, incx, - beta, - y, incy, - NULL - ); - } - else - { - /* fusing factor. */ - b_fuse = 4; - - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); - A1 = a + (0 )*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - y1 = y + (0 )*incy; - - /* y = y + alpha * A1 * x1; */ - bli_caxpyf_zen_int_4 - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, rs_at, cs_at, - x1, incx, - y1, incy, - NULL - ); - } - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); -} - - -#else -INSERT_GENTFUNC_BASIC0( gemv_unf_var2 ) -#endif +INSERT_GENTFUNC_BASIC0( gemv_unf_var2 ) \ No newline at end of file diff --git a/frame/2/gemv/bli_gemv_unf_var2_amd.c b/frame/2/gemv/bli_gemv_unf_var2_amd.c new file mode 100644 index 000000000..d7f5145e3 --- /dev/null +++ b/frame/2/gemv/bli_gemv_unf_var2_amd.c @@ -0,0 +1,879 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#define BLIS_DGEMV_VAR2_FUSE 4 + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + trans_t transa, \ + conj_t conjx, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* x, inc_t incx, \ + ctype* beta, \ + ctype* y, inc_t incy, \ + cntx_t* cntx \ + ) \ +{ \ +\ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); \ +\ + bli_init_once(); \ +\ + if(cntx == NULL) cntx = bli_gks_query_cntx(); \ +\ + const num_t dt = PASTEMAC(ch,type); \ +\ + ctype* zero = PASTEMAC(ch,0); \ + ctype* A1; \ + ctype* x1; \ + ctype* y1; \ + dim_t i; \ + dim_t b_fuse, f; \ + dim_t n_elem, n_iter; \ + inc_t rs_at, cs_at; \ + conj_t conja; \ +\ + bli_set_dims_incs_with_trans( transa, \ + m, n, rs_a, cs_a, \ + &n_elem, &n_iter, &rs_at, &cs_at ); \ +\ + conja = bli_extract_conj( transa ); \ +\ + /* If beta is zero, use setv. Otherwise, scale by beta. */ \ + if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + /* y = 0; */ \ + PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + n_elem, \ + zero, \ + y, incy, \ + cntx, \ + NULL \ + ); \ + } \ + else \ + { \ + /* y = beta * y; */ \ + PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + n_elem, \ + beta, \ + y, incy, \ + cntx, \ + NULL \ + ); \ + } \ +\ + PASTECH(ch,axpyf_ker_ft) kfp_af; \ +\ + /* Query the context for the kernel function pointer and fusing factor. */ \ + kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ + b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \ +\ + for ( i = 0; i < n_iter; i += f ) \ + { \ + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \ +\ + A1 = a + (0 )*rs_at + (i )*cs_at; \ + x1 = x + (i )*incx; \ + y1 = y + (0 )*incy; \ +\ + /* y = y + alpha * A1 * x1; */ \ + kfp_af \ + ( \ + conja, \ + conjx, \ + n_elem, \ + f, \ + alpha, \ + A1, rs_at, cs_at, \ + x1, incx, \ + y1, incy, \ + cntx \ + ); \ + } \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); \ +} + +void bli_dgemv_unf_var2 + ( + trans_t transa, + conj_t conjx, + dim_t m, + dim_t n, + double* alpha, + double* a, inc_t rs_a, inc_t cs_a, + double* x, inc_t incx, + double* beta, + double* y, inc_t incy, + cntx_t* cntx + ) +{ + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); + double* A1; + double* x1; + dim_t i; + dim_t f; + dim_t n_elem, n_iter; + inc_t rs_at, cs_at; + conj_t conja; + //memory pool declarations for packing vector Y. + mem_t mem_bufY; + rntm_t rntm; + double *y_buf = y; + inc_t buf_incy = incy; + + // For AMD these APIS are invoked skipping intermediate framework layers + // Hence we need to ensure that cntx is set here. 
+ bli_init_once(); + if(cntx == NULL) cntx = bli_gks_query_cntx(); + + bli_set_dims_incs_with_trans( transa, + m, n, rs_a, cs_a, + &n_elem, &n_iter, &rs_at, &cs_at ); + + conja = bli_extract_conj( transa ); + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == FALSE) + { + const num_t dt = PASTEMAC(d,type); + double* x1; + double* y1; + /* If beta is zero, use setv. Otherwise, scale by beta. */ + if ( PASTEMAC(d,eq0)( *beta ) ) + { + double* zero = PASTEMAC(d,0); + /* y = 0; */ + PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + n_elem, + zero, + y, incy, + cntx, + NULL + ); + } + else + { + /* y = beta * y; */ + PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y, incy, + cntx, + NULL + ); + } + + PASTECH(d,axpyf_ker_ft) kfp_af; + + /* Query the context for the kernel function pointer and fusing factor. */ + kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); + dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); + + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + + A1 = a + (0 )*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + y1 = y + (0 )*incy; + + /* y = y + alpha * A1 * x1; */ + kfp_af + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, rs_at, cs_at, + x1, incx, + y1, incy, + cntx + ); + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); + return; + } + + /* If beta is zero, use setv. Otherwise, scale by beta. */ + /* y = beta * y; */ + /* beta=0 case is hadled by scalv internally */ + + bli_dscalv_zen_int10 + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y, incy, + cntx + ); + + if( bli_deq0( *alpha ) ) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3) + return; + } + + if (incy > 1) + { + /* + Initialize mem pool buffer to NULL and size to 0 + "buf" and "size" fields are assigned once memory + is allocated from the pool in bli_membrk_acquire_m(). + This will ensure bli_mem_is_alloc() will be passed on + an allocated memory if created or a NULL . + */ + mem_bufY.pblk.buf = NULL; mem_bufY.pblk.block_size = 0; + mem_bufY.buf_type = 0; mem_bufY.size = 0; + mem_bufY.pool = NULL; + + /* In order to get the buffer from pool via rntm access to memory broker + is needed.Following are initializations for rntm */ + + bli_rntm_init_from_global( &rntm ); + bli_rntm_set_num_threads_only( 1, &rntm ); + bli_membrk_rntm_set_membrk( &rntm ); + + //calculate the size required for n_elem double elements in vector Y. 
+ size_t buffer_size = n_elem * sizeof(double); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_dgemv_unf_var2(): get mem pool block\n" ); + #endif + + /*acquire a Buffer(n_elem*size(double)) from the memory broker + and save the associated mem_t entry to mem_bufY.*/ + bli_membrk_acquire_m(&rntm, + buffer_size, + BLIS_BUFFER_FOR_B_PANEL, + &mem_bufY); + + /*Continue packing Y if buffer memory is allocated*/ + if ((bli_mem_is_alloc( &mem_bufY ))) + { + y_buf = bli_mem_buffer(&mem_bufY); + + //pack Y vector with non-unit stride to a temp buffer y_buf with unit stride + for(dim_t y_index = 0 ; y_index < n_elem ; y_index++) + { + *(y_buf + y_index) = *(y + (y_index * incy)) ; + } + // stride of vector y_buf =1 + buf_incy = 1; + } + } + + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, BLIS_DGEMV_VAR2_FUSE ); + + A1 = a + (0 )*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + + /* y = y + alpha * A1 * x1; */ + bli_daxpyf_zen_int_16x4 + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, rs_at, cs_at, + x1, incx, + y_buf, buf_incy, + cntx + ); + } + if ((incy > 1) && bli_mem_is_alloc( &mem_bufY )) + { + //store the result from unit strided y_buf to non-unit strided Y + for(dim_t y_index = 0 ; y_index < n_elem ; y_index++) + { + *(y + (y_index * incy)) = *(y_buf + y_index) ; + } + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_dgemv_unf_var2(): releasing mem pool block\n" ); + #endif + // Return the buffer to pool + bli_membrk_release(&rntm , &mem_bufY); + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); +} + +void bli_sgemv_unf_var2 + ( + trans_t transa, + conj_t conjx, + dim_t m, + dim_t n, + float* alpha, + float* a, inc_t rs_a, inc_t cs_a, + float* x, inc_t incx, + float* beta, + float* y, inc_t incy, + cntx_t* cntx + ) +{ + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); + float* A1; + float* x1; + float* y1; + dim_t i; + dim_t b_fuse, f; + dim_t n_elem, n_iter; + inc_t rs_at, cs_at; + conj_t conja; + + // For AMD these APIS are invoked skipping intermediate framework layers + // Hence we need to ensure that cntx is set here. + bli_init_once(); + if(cntx == NULL) cntx = bli_gks_query_cntx(); + + bli_set_dims_incs_with_trans( transa, + m, n, rs_a, cs_a, + &n_elem, &n_iter, &rs_at, &cs_at ); + + conja = bli_extract_conj( transa ); + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == FALSE) + { + const num_t dt = PASTEMAC(s,type); + /* If beta is zero, use setv. Otherwise, scale by beta. */ + if ( PASTEMAC(s,eq0)( *beta ) ) + { + float* zero = PASTEMAC(s,0); + /* y = 0; */ + PASTEMAC2(s,setv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + n_elem, + zero, + y, incy, + cntx, + NULL + ); + } + else + { + /* y = beta * y; */ + PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y, incy, + cntx, + NULL + ); + } + + PASTECH(s,axpyf_ker_ft) kfp_af; + + /* Query the context for the kernel function pointer and fusing factor. 
*/ + kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); + b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); + + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + + A1 = a + (0 )*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + y1 = y + (0 )*incy; + + /* y = y + alpha * A1 * x1; */ + kfp_af + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, rs_at, cs_at, + x1, incx, + y1, incy, + cntx + ); + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); + return; + } + + /* If beta is zero, use setv. Otherwise, scale by beta. */ + /* y = beta * y; */ + /* beta=0 case is hadled by scalv internally */ + bli_sscalv_zen_int10 + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y, incy, + cntx + ); + + if( bli_seq0( *alpha ) ) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3) + return; + } + + /* Query the context for the kernel function pointer and fusing factor. */ + b_fuse = 6; + + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + + A1 = a + (0 )*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + y1 = y + (0 )*incy; + + /* y = y + alpha * A1 * x1; */ + bli_saxpyf_zen_int_6 + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, rs_at, cs_at, + x1, incx, + y1, incy, + cntx + ); + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); +} + + +void bli_zgemv_unf_var2 + ( + trans_t transa, + conj_t conjx, + dim_t m, + dim_t n, + dcomplex* alpha, + dcomplex* a, inc_t rs_a, inc_t cs_a, + dcomplex* x, inc_t incx, + dcomplex* beta, + dcomplex* y, inc_t incy, + cntx_t* cntx + ) +{ + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); + dcomplex* A1; + dcomplex* x1; + dcomplex* y1; + dim_t i; + dim_t b_fuse, f; + dim_t n_elem, n_iter; + inc_t rs_at, cs_at; + conj_t conja; + + // For AMD these APIS are invoked skipping intermediate framework layers + // Hence we need to ensure that cntx is set here. + bli_init_once(); + if(cntx == NULL) cntx = bli_gks_query_cntx(); + + bli_set_dims_incs_with_trans( transa, + m, n, rs_a, cs_a, + &n_elem, &n_iter, &rs_at, &cs_at ); + + conja = bli_extract_conj( transa ); + + /* If beta is zero, use setv. Otherwise, scale by beta. */ + /* y = beta * y; */ + + /* beta=0 case is hadled by scalv internally */ + /* bli_zscalv_zen_int10 + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y, + incy, + cntx + );*/ + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == FALSE) + { + const num_t dt = PASTEMAC(z,type); + /* If beta is zero, use setv. Otherwise, scale by beta. */ + if ( PASTEMAC(z,eq0)( *beta ) ) + { + dcomplex* zero = PASTEMAC(z,0); + /* y = 0; */ + PASTEMAC2(z,setv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + n_elem, + zero, + y, incy, + cntx, + NULL + ); + } + else + { + /* y = beta * y; */ + PASTEMAC2(z,scalv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y, incy, + cntx, + NULL + ); + } + + PASTECH(z,axpyf_ker_ft) kfp_af; + + /* Query the context for the kernel function pointer and fusing factor. 
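
The fusing loops above rely on bli_determine_blocksize_dim_f() to shrink the final panel when fewer than b_fuse columns remain. A sketch of the expected behaviour, assuming the helper reduces to a simple clamp for this forward-iterating case:

#include <stddef.h>

/* Forward-direction blocksize: take the full fusing factor until fewer
   than b_fuse iterations remain, then take only the remainder.
   With n_iter = 10 and b_fuse = 6 the loop runs panels of width 6, then 4. */
static size_t blocksize_f( size_t i, size_t n_iter, size_t b_fuse )
{
    size_t remaining = n_iter - i;
    return ( remaining < b_fuse ) ? remaining : b_fuse;
}
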
*/ + kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); + b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); + + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + + A1 = a + (0 )*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + y1 = y + (0 )*incy; + + /* y = y + alpha * A1 * x1; */ + kfp_af + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, rs_at, cs_at, + x1, incx, + y1, incy, + cntx + ); + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); + return; + } + + bli_zscalv_ex + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y, incy, + cntx, + NULL + ); + + if( bli_zeq0( *alpha ) ) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); + return; + } + + // for non-unit incx, incy and rs_at and conjugate will be added in the next patch + if( (incx == 1 && incy == 1 && rs_at == 1 ) && + !bli_is_conj(conja) && !bli_is_conj(conjx) && !bli_is_trans(transa)) + { + // This gemv code deals with the followint conditions only + // 1. incx, incy, and row stride equal to one + // 2. Non conjugate A matrix and X vector + // 3. No Transpose for A Martix + // Rest is taken care by the else part (axpyf implementation) + bli_zgemv_zen_int_4x4 + ( + conja, + conjx, + m, + n, + alpha, + a, rs_at, cs_at, + x, incx, + beta, + y, incy, + cntx + ); + } + else + { + /* fusing factor */ + b_fuse = 4; + + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + A1 = a + (0 )*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + y1 = y + (0 )*incy; + + /* y = y + alpha * A1 * x1; */ + bli_zaxpyf_zen_int_4 + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, rs_at, cs_at, + x1, incx, + y1, incy, + cntx + ); + } + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); +} + +void bli_cgemv_unf_var2 + ( + trans_t transa, + conj_t conjx, + dim_t m, + dim_t n, + scomplex* alpha, + scomplex* a, inc_t rs_a, inc_t cs_a, + scomplex* x, inc_t incx, + scomplex* beta, + scomplex* y, inc_t incy, + cntx_t* cntx + ) +{ + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); + scomplex* A1; + scomplex* x1; + scomplex* y1; + dim_t i; + dim_t b_fuse, f; + dim_t n_elem, n_iter; + inc_t rs_at, cs_at; + conj_t conja; + + // For AMD these APIS are invoked skipping intermediate framework layers + // Hence we need to ensure that cntx is set here. + bli_init_once(); + if(cntx == NULL) cntx = bli_gks_query_cntx(); + + bli_set_dims_incs_with_trans( transa, + m, n, rs_a, cs_a, + &n_elem, &n_iter, &rs_at, &cs_at ); + + conja = bli_extract_conj( transa ); + + /* If beta is zero, use setv. Otherwise, scale by beta. */ + /* y = beta * y; */ + /* beta=0 case is hadled by scalv internally */ + /*bli_cscalv_zen_int10 + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y, + incy, + cntx + );*/ + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == FALSE) + { + const num_t dt = PASTEMAC(c,type); + /* If beta is zero, use setv. Otherwise, scale by beta. */ + if ( PASTEMAC(c,eq0)( *beta ) ) + { + scomplex* zero = PASTEMAC(c,0); + /* y = 0; */ + PASTEMAC2(c,setv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + n_elem, + zero, + y, incy, + cntx, + NULL + ); + } + else + { + /* y = beta * y; */ + PASTEMAC2(c,scalv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y, incy, + cntx, + NULL + ); + } + + PASTECH(c,axpyf_ker_ft) kfp_af; + + /* Query the context for the kernel function pointer and fusing factor. 
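
The unit-stride, no-conjugate, no-transpose guard above (and its twin in the cgemv path below) is the only route into the direct 4x4 kernel; every other combination falls through to the axpyf loop. A structural sketch of that routing, with illustrative stub kernels:

#include <stdbool.h>

typedef struct { double real, imag; } zcplx;

/* Stub kernels standing in for the two code paths; bodies omitted. */
static void gemv_fast_4x4( int m, int n, const zcplx* a,
                           const zcplx* x, zcplx* y )
{
    (void)m; (void)n; (void)a; (void)x; (void)y;
}

static void gemv_axpyf_loop( int m, int n, const zcplx* a,
                             const zcplx* x, zcplx* y )
{
    (void)m; (void)n; (void)a; (void)x; (void)y;
}

/* Only the contiguous, unmodified case reaches the specialised kernel;
   strided, conjugated or transposed operands take the general loop. */
static void gemv_dispatch( int m, int n,
                           const zcplx* a, int rs_a,
                           const zcplx* x, int incx,
                           zcplx* y, int incy,
                           bool conj_a, bool conj_x, bool trans_a )
{
    if ( rs_a == 1 && incx == 1 && incy == 1 &&
         !conj_a && !conj_x && !trans_a )
        gemv_fast_4x4( m, n, a, x, y );
    else
        gemv_axpyf_loop( m, n, a, x, y );
}
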
*/ + kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); + b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); + + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + + A1 = a + (0 )*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + y1 = y + (0 )*incy; + + /* y = y + alpha * A1 * x1; */ + kfp_af + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, rs_at, cs_at, + x1, incx, + y1, incy, + cntx + ); + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); + return; + } + + bli_cscalv_ex + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y, incy, + cntx, + NULL + ); + + + + if( bli_ceq0( *alpha ) ) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3) + return; + } + + // for non-unit incx, incy and rs_at and conjugate will be added in the next patch + if( ( (incx == 1) && (incy == 1) && (rs_at == 1) ) && + !bli_is_conj(conja) && !bli_is_conj(conjx) && + !bli_is_trans(transa)) + { + // This gemv code deals with the followint conditions only + // 1. incx, incy, and row stride equal to one + // 2. Non conjugate A matrix and X vector + // 3. No Transpose for A Martix + // Rest is taken care by the else part (axpyf implementation) + bli_cgemv_zen_int_4x4 + ( + conja, + conjx, + m, + n, + alpha, + a, rs_at, cs_at, + x, incx, + beta, + y, incy, + cntx + ); + } + else + { + /* fusing factor. */ + b_fuse = 4; + + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + A1 = a + (0 )*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + y1 = y + (0 )*incy; + + /* y = y + alpha * A1 * x1; */ + bli_caxpyf_zen_int_4 + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, rs_at, cs_at, + x1, incx, + y1, incy, + cntx + ); + } + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); +} + + + diff --git a/frame/2/hemv/bli_hemv_unf_var1.c b/frame/2/hemv/bli_hemv_unf_var1.c index 6790e5bd0..e3229543c 100644 --- a/frame/2/hemv/bli_hemv_unf_var1.c +++ b/frame/2/hemv/bli_hemv_unf_var1.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2021-22, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -216,207 +216,5 @@ void PASTEMAC(ch,varname) \ } \ } -#ifdef BLIS_CONFIG_EPYC - -void bli_post_hemv_8x8 - ( - double *a, - double *x, - double *y, - double *alpha, - dim_t cs_a, - dim_t rs_a - ); - -void bli_dhemv_unf_var1 - ( - uplo_t uplo, - conj_t conja, - conj_t conjx, - conj_t conjh, - dim_t m, - double* alpha, - double* a, inc_t rs_a, inc_t cs_a, - double* x, inc_t incx, - double* beta, - double* y, inc_t incy, - cntx_t* cntx - ) -{ - const num_t dt = PASTEMAC(d,type); - - double* one = PASTEMAC(d,1); - double* zero = PASTEMAC(d,0); - double* A10; - double* A11; - double* a10t; - double* alpha11; - double* a21; - double* x0; - double* x1; - double* chi11; - double* y0; - double* y1; - double* y01; - double* psi11; - double* y21; - double conjx_chi11; - double alpha_chi11; - double alpha11_temp; - dim_t i, k, j; - dim_t b_fuse, f; - dim_t n_behind; - dim_t f_ahead, f_behind; - inc_t rs_at, cs_at; - conj_t conj0 = 0, conj1 = 0; - - /* The algorithm will be expressed in terms of the lower triangular - * case;the upper triangular case is supported by swapping the row - * and column strides of A and toggling some conj parameters. 
*/ - if ( bli_is_lower( uplo ) ) - { - rs_at = rs_a; - cs_at = cs_a; - } - else /* if ( bli_is_upper( uplo ) ) */ - { - rs_at = cs_a; - cs_at = rs_a; - } - - /* If beta is zero, use setv. Otherwise, scale by beta. */ - if ( PASTEMAC(d,eq0)( *beta ) ) - { - /* y = 0; */ - PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - m, - zero, - y, incy, - cntx, - NULL - ); - } - else - { - /* y = beta * y; */ - PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - m, - beta, - y, incy, - cntx, - NULL - ); - } - - PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker; - - /* Query the context for the kernel function pointer and fusing - * factor. */ - /* Assign kernel function pointer and fusing factor. */ - arch_t id = bli_arch_query_id(); - bool bamdzen = ((id == BLIS_ARCH_ZEN4) ||(id == BLIS_ARCH_ZEN3) - || (id == BLIS_ARCH_ZEN2) || (id == BLIS_ARCH_ZEN)); - if (bamdzen) - { - kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8; - b_fuse = 8; - } - else - { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - kfp_dotxaxpyf_ker = - bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx); - b_fuse = - bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); - } - - for ( i = 0; i < m; i += f ) - { - f = bli_determine_blocksize_dim_f( i, m, b_fuse ); - n_behind = i; - A10 = a + (i )*rs_at + (0 )*cs_at; - A11 = a + (i )*rs_at + (i )*cs_at; - x0 = x + (0 )*incx; - x1 = x + (i )*incx; - y0 = y + (0 )*incy; - y1 = y + (i )*incy; - - /* y1 = y1 + alpha * A10 * x0; (dotxf) */ - /* y0 = y0 + alpha * A10' * x1; (axpyf) */ - kfp_dotxaxpyf_ker - ( - conj0, - conj1, - conjx, - conjx, - n_behind, - f, - alpha, - A10, cs_at, rs_at, - x0, incx, - x1, incx, - one, - y1, incy, - y0, incy, - cntx - ); - - /* y1 = y1 + alpha * A11 * x1; (variant 4) */ - if((f == 8) && (incx == 1) && (incy == 1) && (cs_at == 1)) - { - /*this helper function handles unit stride only*/ - bli_post_hemv_8x8(A11, x1, y1, alpha, rs_at, cs_at); - } - else - { - for ( k = 0; k < f; ++k ) - { - f_behind = k; - f_ahead = f - k - 1; - a10t = A11 + (k )*rs_at + (0 )*cs_at; - alpha11 = A11 + (k )*rs_at + (k )*cs_at; - a21 = A11 + (k+1)*rs_at + (k )*cs_at; - chi11 = x1 + (k )*incx; - y01 = y1 + (0 )*incy; - psi11 = y1 + (k )*incy; - y21 = y1 + (k+1)*incy; - - /* y01 = y01 + alpha * a10t' * chi11; */ - PASTEMAC(d,copycjs)( conjx, *chi11, - conjx_chi11 ); - PASTEMAC(d,scal2s)( *alpha, conjx_chi11, - alpha_chi11 ); - for ( j = 0; j < f_behind; ++j ) - PASTEMAC(d,axpys)( alpha_chi11, - *(a10t + j*cs_at), - *(y01 + j*incy) ); - - PASTEMAC(d,copycjs)( conja, *alpha11, - alpha11_temp ); - - /* psi11 = psi11 + alpha * alpha11 * chi11; */ - PASTEMAC(d,axpys)( alpha_chi11, alpha11_temp, - *psi11 ); - - /* y21 = y21 + alpha * a21 * chi11; */ - for ( j = 0; j < f_ahead; ++j ) - { - PASTEMAC(d,axpys)( alpha_chi11, - *(a21 + j*rs_at), - *(y21 + j*incy) ); - } - } - } - } -} -GENTFUNC(float, s, hemv_unf_var1) -GENTFUNC(scomplex, c, hemv_unf_var1) -GENTFUNC(dcomplex, z, hemv_unf_var1) -#else INSERT_GENTFUNC_BASIC0( hemv_unf_var1 ) -#endif diff --git a/frame/2/hemv/bli_hemv_unf_var1_amd.c b/frame/2/hemv/bli_hemv_unf_var1_amd.c new file mode 100644 index 000000000..6532323d1 --- /dev/null +++ b/frame/2/hemv/bli_hemv_unf_var1_amd.c @@ -0,0 +1,418 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021-22, Advanced Micro Devices, Inc. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + uplo_t uplo, \ + conj_t conja, \ + conj_t conjx, \ + conj_t conjh, \ + dim_t m, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* x, inc_t incx, \ + ctype* beta, \ + ctype* y, inc_t incy, \ + cntx_t* cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + ctype* one = PASTEMAC(ch,1); \ + ctype* zero = PASTEMAC(ch,0); \ + ctype* A10; \ + ctype* A11; \ + ctype* a10t; \ + ctype* alpha11; \ + ctype* a21; \ + ctype* x0; \ + ctype* x1; \ + ctype* chi11; \ + ctype* y0; \ + ctype* y1; \ + ctype* y01; \ + ctype* psi11; \ + ctype* y21; \ + ctype conjx_chi11; \ + ctype alpha_chi11; \ + ctype alpha11_temp; \ + dim_t i, k, j; \ + dim_t b_fuse, f; \ + dim_t n_behind; \ + dim_t f_ahead, f_behind; \ + inc_t rs_at, cs_at; \ + conj_t conj0, conj1; \ +\ + /* The algorithm will be expressed in terms of the lower triangular case; + the upper triangular case is supported by swapping the row and column + strides of A and toggling some conj parameters. */ \ + if ( bli_is_lower( uplo ) ) \ + { \ + rs_at = rs_a; \ + cs_at = cs_a; \ +\ + conj0 = conja; \ + conj1 = bli_apply_conj( conjh, conja ); \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + rs_at = cs_a; \ + cs_at = rs_a; \ +\ + conj0 = bli_apply_conj( conjh, conja ); \ + conj1 = conja; \ + } \ +\ + /* If beta is zero, use setv. Otherwise, scale by beta. */ \ + if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + /* y = 0; */ \ + PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + m, \ + zero, \ + y, incy, \ + cntx, \ + NULL \ + ); \ + } \ + else \ + { \ + /* y = beta * y; */ \ + PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + m, \ + beta, \ + y, incy, \ + cntx, \ + NULL \ + ); \ + } \ +\ + PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \ +\ + /* Query the context for the kernel function pointer and fusing factor. 
*/ \ + kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \ + b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \ +\ + for ( i = 0; i < m; i += f ) \ + { \ + f = bli_determine_blocksize_dim_f( i, m, b_fuse ); \ + n_behind = i; \ + A10 = a + (i )*rs_at + (0 )*cs_at; \ + A11 = a + (i )*rs_at + (i )*cs_at; \ + x0 = x + (0 )*incx; \ + x1 = x + (i )*incx; \ + y0 = y + (0 )*incy; \ + y1 = y + (i )*incy; \ +\ + /* y1 = y1 + alpha * A10 * x0; (dotxf) */ \ + /* y0 = y0 + alpha * A10' * x1; (axpyf) */ \ + kfp_xf \ + ( \ + conj0, \ + conj1, \ + conjx, \ + conjx, \ + n_behind, \ + f, \ + alpha, \ + A10, cs_at, rs_at, \ + x0, incx, \ + x1, incx, \ + one, \ + y1, incy, \ + y0, incy, \ + cntx \ + ); \ +\ + /* y1 = y1 + alpha * A11 * x1; (variant 4) */ \ + for ( k = 0; k < f; ++k ) \ + { \ + f_behind = k; \ + f_ahead = f - k - 1; \ + a10t = A11 + (k )*rs_at + (0 )*cs_at; \ + alpha11 = A11 + (k )*rs_at + (k )*cs_at; \ + a21 = A11 + (k+1)*rs_at + (k )*cs_at; \ + chi11 = x1 + (k )*incx; \ + y01 = y1 + (0 )*incy; \ + psi11 = y1 + (k )*incy; \ + y21 = y1 + (k+1)*incy; \ +\ + /* y01 = y01 + alpha * a10t' * chi11; */ \ + PASTEMAC(ch,copycjs)( conjx, *chi11, conjx_chi11 ); \ + PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \ + if ( bli_is_conj( conj1 ) ) \ + { \ + for ( j = 0; j < f_behind; ++j ) \ + PASTEMAC(ch,axpyjs)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \ + } \ + else \ + { \ + for ( j = 0; j < f_behind; ++j ) \ + PASTEMAC(ch,axpys)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \ + } \ +\ + /* For hemv, explicitly set the imaginary component of alpha11 to + zero. */ \ + PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ + if ( bli_is_conj( conjh ) ) \ + PASTEMAC(ch,seti0s)( alpha11_temp ); \ +\ + /* psi11 = psi11 + alpha * alpha11 * chi11; */ \ + PASTEMAC(ch,axpys)( alpha_chi11, alpha11_temp, *psi11 ); \ +\ + /* y21 = y21 + alpha * a21 * chi11; */ \ + if ( bli_is_conj( conj0 ) ) \ + { \ + for ( j = 0; j < f_ahead; ++j ) \ + PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \ + } \ + else \ + { \ + for ( j = 0; j < f_ahead; ++j ) \ + PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \ + } \ + } \ + } \ +} + +void bli_post_hemv_8x8 + ( + double *a, + double *x, + double *y, + double *alpha, + dim_t cs_a, + dim_t rs_a + ); + +void bli_dhemv_unf_var1 + ( + uplo_t uplo, + conj_t conja, + conj_t conjx, + conj_t conjh, + dim_t m, + double* alpha, + double* a, inc_t rs_a, inc_t cs_a, + double* x, inc_t incx, + double* beta, + double* y, inc_t incy, + cntx_t* cntx + ) +{ + const num_t dt = PASTEMAC(d,type); + + double* one = PASTEMAC(d,1); + double* zero = PASTEMAC(d,0); + double* A10; + double* A11; + double* a10t; + double* alpha11; + double* a21; + double* x0; + double* x1; + double* chi11; + double* y0; + double* y1; + double* y01; + double* psi11; + double* y21; + double conjx_chi11; + double alpha_chi11; + double alpha11_temp; + dim_t i, k, j; + dim_t b_fuse, f; + dim_t n_behind; + dim_t f_ahead, f_behind; + inc_t rs_at, cs_at; + conj_t conj0 = 0, conj1 = 0; + + /* The algorithm will be expressed in terms of the lower triangular + * case;the upper triangular case is supported by swapping the row + * and column strides of A and toggling some conj parameters. */ + if ( bli_is_lower( uplo ) ) + { + rs_at = rs_a; + cs_at = cs_a; + } + else /* if ( bli_is_upper( uplo ) ) */ + { + rs_at = cs_a; + cs_at = rs_a; + } + + /* If beta is zero, use setv. Otherwise, scale by beta. 
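
The dotxaxpyf kernel used above fuses the two panel updates noted in the comments, y1 += alpha * A10 * x0 (dotxf) and y0 += alpha * A10' * x1 (axpyf), so the panel is read from memory only once. A real-valued, unit-stride reference sketch (illustrative):

#include <stddef.h>

/* Fused dotxf + axpyf over one f x n panel A (row-major here, a[i*lda + j]):
     y1(0:f-1) += alpha * A  * x0(0:n-1)    (dotxf part)
     y0(0:n-1) += alpha * A' * x1(0:f-1)    (axpyf part)
   Doing both in one sweep means the panel is read only once. */
static void dotxaxpyf_ref( size_t f, size_t n, double alpha,
                           const double* a, size_t lda,
                           const double* x0, const double* x1,
                           double* y0, double* y1 )
{
    for ( size_t i = 0; i < f; ++i )
    {
        const double* ai = a + i * lda;
        double acc = 0.0;

        for ( size_t j = 0; j < n; ++j )
        {
            acc   += ai[j] * x0[j];          /* row i of A dotted with x0 */
            y0[j] += alpha * ai[j] * x1[i];  /* column j of A' scaled into y0 */
        }
        y1[i] += alpha * acc;
    }
}
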
*/ + if ( PASTEMAC(d,eq0)( *beta ) ) + { + /* y = 0; */ + PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + m, + zero, + y, incy, + cntx, + NULL + ); + } + else + { + /* y = beta * y; */ + PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + m, + beta, + y, incy, + cntx, + NULL + ); + } + + PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker; + + /* Query the context for the kernel function pointer and fusing + * factor. */ + /* Assign kernel function pointer and fusing factor. */ + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) + { + kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8; + b_fuse = 8; + } + else + { + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + kfp_dotxaxpyf_ker = + bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx); + b_fuse = + bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); + } + + for ( i = 0; i < m; i += f ) + { + f = bli_determine_blocksize_dim_f( i, m, b_fuse ); + n_behind = i; + A10 = a + (i )*rs_at + (0 )*cs_at; + A11 = a + (i )*rs_at + (i )*cs_at; + x0 = x + (0 )*incx; + x1 = x + (i )*incx; + y0 = y + (0 )*incy; + y1 = y + (i )*incy; + + /* y1 = y1 + alpha * A10 * x0; (dotxf) */ + /* y0 = y0 + alpha * A10' * x1; (axpyf) */ + kfp_dotxaxpyf_ker + ( + conj0, + conj1, + conjx, + conjx, + n_behind, + f, + alpha, + A10, cs_at, rs_at, + x0, incx, + x1, incx, + one, + y1, incy, + y0, incy, + cntx + ); + + /* y1 = y1 + alpha * A11 * x1; (variant 4) */ + if((f == 8) && (incx == 1) && (incy == 1) && (cs_at == 1)) + { + /*this helper function handles unit stride only*/ + bli_post_hemv_8x8(A11, x1, y1, alpha, rs_at, cs_at); + } + else + { + for ( k = 0; k < f; ++k ) + { + f_behind = k; + f_ahead = f - k - 1; + a10t = A11 + (k )*rs_at + (0 )*cs_at; + alpha11 = A11 + (k )*rs_at + (k )*cs_at; + a21 = A11 + (k+1)*rs_at + (k )*cs_at; + chi11 = x1 + (k )*incx; + y01 = y1 + (0 )*incy; + psi11 = y1 + (k )*incy; + y21 = y1 + (k+1)*incy; + + /* y01 = y01 + alpha * a10t' * chi11; */ + PASTEMAC(d,copycjs)( conjx, *chi11, + conjx_chi11 ); + PASTEMAC(d,scal2s)( *alpha, conjx_chi11, + alpha_chi11 ); + for ( j = 0; j < f_behind; ++j ) + PASTEMAC(d,axpys)( alpha_chi11, + *(a10t + j*cs_at), + *(y01 + j*incy) ); + + PASTEMAC(d,copycjs)( conja, *alpha11, + alpha11_temp ); + + /* psi11 = psi11 + alpha * alpha11 * chi11; */ + PASTEMAC(d,axpys)( alpha_chi11, alpha11_temp, + *psi11 ); + + /* y21 = y21 + alpha * a21 * chi11; */ + for ( j = 0; j < f_ahead; ++j ) + { + PASTEMAC(d,axpys)( alpha_chi11, + *(a21 + j*rs_at), + *(y21 + j*incy) ); + } + } + } + } +} +GENTFUNC(float, s, hemv_unf_var1) +GENTFUNC(scomplex, c, hemv_unf_var1) +GENTFUNC(dcomplex, z, hemv_unf_var1) + + diff --git a/frame/2/hemv/bli_hemv_unf_var3.c b/frame/2/hemv/bli_hemv_unf_var3.c index abf08dfda..b8e26cbcb 100644 --- a/frame/2/hemv/bli_hemv_unf_var3.c +++ b/frame/2/hemv/bli_hemv_unf_var3.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. 
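
Taken together, the panel kernel and the scalar diagonal-block loop above compute y = beta*y + alpha*A*x with A stored only in its lower triangle. A compact real-valued reference of the overall effect for unit strides (illustrative):

#include <stddef.h>

/* y = beta*y + alpha*A*x for a symmetric m x m matrix whose lower
   triangle is stored column-major (a[i + j*lda], i >= j); the strictly
   upper part is obtained from symmetry. */
static void symv_lower_ref( size_t m, double alpha, const double* a,
                            size_t lda, const double* x,
                            double beta, double* y )
{
    for ( size_t i = 0; i < m; ++i ) y[i] *= beta;

    for ( size_t j = 0; j < m; ++j )
    {
        /* Diagonal element. */
        y[j] += alpha * a[j + j*lda] * x[j];

        /* Each stored a(i,j), i > j, also acts as its mirror a(j,i). */
        for ( size_t i = j + 1; i < m; ++i )
        {
            y[i] += alpha * a[i + j*lda] * x[j];
            y[j] += alpha * a[i + j*lda] * x[i];
        }
    }
}
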
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -216,210 +216,6 @@ void PASTEMAC(ch,varname) \ } \ } -#ifdef BLIS_CONFIG_EPYC - -void bli_pre_hemv_8x8 - ( - double *a, - double *x, - double *y, - double *alpha, - dim_t cs_a, - dim_t rs_a - ); - -void bli_dhemv_unf_var3 - ( - uplo_t uplo, - conj_t conja, - conj_t conjx, - conj_t conjh, - dim_t m, - double* alpha, - double* a, inc_t rs_a, inc_t cs_a, - double* x, inc_t incx, - double* beta, - double* y, inc_t incy, - cntx_t* cntx - ) -{ - const num_t dt = PASTEMAC(d,type); - - double* one = PASTEMAC(d,1); - double* zero = PASTEMAC(d,0); - double* A11; - double* A21; - double* a10t; - double* alpha11; - double* a21; - double* x1; - double* x2; - double* chi11; - double* y1; - double* y2; - double* y01; - double* psi11; - double* y21; - double conjx_chi11; - double alpha_chi11; - double alpha11_temp; - dim_t i, k, j; - dim_t b_fuse, f; - dim_t n_ahead; - dim_t f_ahead, f_behind; - inc_t rs_at, cs_at; - conj_t conj0 = 0, conj1 = 0; - - /* The algorithm will be expressed in terms of the lower triangular - * case; the upper triangular case is supported by swapping the row - * and column strides of A and toggling some conj parameters. */ - if ( bli_is_lower( uplo ) ) - { - rs_at = rs_a; - cs_at = cs_a; - } - else /* if ( bli_is_upper( uplo ) ) */ - { - rs_at = cs_a; - cs_at = rs_a; - } - - /* If beta is zero, use setv. Otherwise, scale by beta. */ - if ( PASTEMAC(d,eq0)( *beta ) ) - { - /* y = 0; */ - PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - m, - zero, - y, incy, - cntx, - NULL - ); - } - else - { - /* y = beta * y; */ - PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - m, - beta, - y, incy, - cntx, - NULL - ); - } - - PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker; - - arch_t id = bli_arch_query_id(); - bool bamdzen = ((id == BLIS_ARCH_ZEN4) || (id == BLIS_ARCH_ZEN3) - || (id == BLIS_ARCH_ZEN2) || (id == BLIS_ARCH_ZEN)); - if (bamdzen) - { - kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8; - b_fuse = 8; - } - else - { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - kfp_dotxaxpyf_ker = - bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx); - b_fuse = - bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); - } - - for ( i = 0; i < m; i += f ) - { - f = bli_determine_blocksize_dim_f( i, m, b_fuse ); - n_ahead = m - i - f; - A11 = a + (i )*rs_at + (i )*cs_at; - A21 = a + (i+f)*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - x2 = x + (i+f)*incx; - y1 = y + (i )*incy; - y2 = y + (i+f)*incy; - - /* y1 = y1 + alpha * A11 * x1; (variant 4) */ - if((f == 8) && (incx == 1) && (incy == 1) && (rs_at == 1)) - { - /*this helper function handles unit stride only*/ - bli_pre_hemv_8x8(A11, x1, y1, alpha, cs_at, rs_at); - } - else - { - for ( k = 0; k < f; ++k ) - { - f_behind = k; - f_ahead = f - k - 1; - a10t = A11 + (k )*rs_at + (0 )*cs_at; - alpha11 = A11 + (k )*rs_at + (k )*cs_at; - a21 = A11 + (k+1)*rs_at + (k )*cs_at; - chi11 = x1 + (k )*incx; - y01 = y1 + (0 )*incy; - psi11 = y1 + (k )*incy; - y21 = y1 + (k+1)*incy; - - /* y01 = y01 + alpha * a10t' * chi11; */ - PASTEMAC(d,copycjs)( conjx, - *chi11, conjx_chi11 ); - PASTEMAC(d,scal2s)( *alpha, conjx_chi11, - alpha_chi11 ); - { - for ( j = 0; j < f_behind; ++j ) - { - PASTEMAC(d,axpys) - ( alpha_chi11, - *(a10t + j*cs_at), - *(y01 + j*incy) ); - } - } - - PASTEMAC(d,copycjs)( conja, *alpha11, - alpha11_temp ); - - /* psi11 = psi11 + alpha * alpha11 * chi11; */ - PASTEMAC(d,axpys)( 
alpha_chi11, alpha11_temp, - *psi11 ); - - /* y21 = y21 + alpha * a21 * chi11; */ - for ( j = 0; j < f_ahead; ++j ) - { - PASTEMAC(d,axpys)( alpha_chi11, - *(a21 + j*rs_at), - *(y21 + j*incy) ); - } - } - } - - /* y1 = y1 + alpha * A21' * x2; (dotxf) */ - /* y2 = y2 + alpha * A21 * x1; (axpyf) */ - kfp_dotxaxpyf_ker - ( - conj0, - conj1, - conjx, - conjx, - n_ahead, - f, - alpha, - A21, rs_at, cs_at, - x2, incx, - x1, incx, - one, - y1, incy, - y2, incy, - cntx - ); - } -} - -GENTFUNC(float, s, hemv_unf_var3) -GENTFUNC(scomplex, c, hemv_unf_var3) -GENTFUNC(dcomplex, z, hemv_unf_var3) -#else INSERT_GENTFUNC_BASIC0( hemv_unf_var3 ) -#endif + diff --git a/frame/2/hemv/bli_hemv_unf_var3_amd.c b/frame/2/hemv/bli_hemv_unf_var3_amd.c new file mode 100644 index 000000000..34d40cf5c --- /dev/null +++ b/frame/2/hemv/bli_hemv_unf_var3_amd.c @@ -0,0 +1,420 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + uplo_t uplo, \ + conj_t conja, \ + conj_t conjx, \ + conj_t conjh, \ + dim_t m, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* x, inc_t incx, \ + ctype* beta, \ + ctype* y, inc_t incy, \ + cntx_t* cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + ctype* one = PASTEMAC(ch,1); \ + ctype* zero = PASTEMAC(ch,0); \ + ctype* A11; \ + ctype* A21; \ + ctype* a10t; \ + ctype* alpha11; \ + ctype* a21; \ + ctype* x1; \ + ctype* x2; \ + ctype* chi11; \ + ctype* y1; \ + ctype* y2; \ + ctype* y01; \ + ctype* psi11; \ + ctype* y21; \ + ctype conjx_chi11; \ + ctype alpha_chi11; \ + ctype alpha11_temp; \ + dim_t i, k, j; \ + dim_t b_fuse, f; \ + dim_t n_ahead; \ + dim_t f_ahead, f_behind; \ + inc_t rs_at, cs_at; \ + conj_t conj0, conj1; \ +\ + /* The algorithm will be expressed in terms of the lower triangular case; + the upper triangular case is supported by swapping the row and column + strides of A and toggling some conj parameters. */ \ + if ( bli_is_lower( uplo ) ) \ + { \ + rs_at = rs_a; \ + cs_at = cs_a; \ +\ + conj0 = bli_apply_conj( conjh, conja ); \ + conj1 = conja; \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + rs_at = cs_a; \ + cs_at = rs_a; \ +\ + conj0 = conja; \ + conj1 = bli_apply_conj( conjh, conja ); \ + } \ +\ + /* If beta is zero, use setv. Otherwise, scale by beta. */ \ + if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + /* y = 0; */ \ + PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + m, \ + zero, \ + y, incy, \ + cntx, \ + NULL \ + ); \ + } \ + else \ + { \ + /* y = beta * y; */ \ + PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + m, \ + beta, \ + y, incy, \ + cntx, \ + NULL \ + ); \ + } \ +\ + PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \ +\ + /* Query the context for the kernel function pointer and fusing factor. */ \ + kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \ + b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \ +\ + for ( i = 0; i < m; i += f ) \ + { \ + f = bli_determine_blocksize_dim_f( i, m, b_fuse ); \ + n_ahead = m - i - f; \ + A11 = a + (i )*rs_at + (i )*cs_at; \ + A21 = a + (i+f)*rs_at + (i )*cs_at; \ + x1 = x + (i )*incx; \ + x2 = x + (i+f)*incx; \ + y1 = y + (i )*incy; \ + y2 = y + (i+f)*incy; \ +\ + /* y1 = y1 + alpha * A11 * x1; (variant 4) */ \ + for ( k = 0; k < f; ++k ) \ + { \ + f_behind = k; \ + f_ahead = f - k - 1; \ + a10t = A11 + (k )*rs_at + (0 )*cs_at; \ + alpha11 = A11 + (k )*rs_at + (k )*cs_at; \ + a21 = A11 + (k+1)*rs_at + (k )*cs_at; \ + chi11 = x1 + (k )*incx; \ + y01 = y1 + (0 )*incy; \ + psi11 = y1 + (k )*incy; \ + y21 = y1 + (k+1)*incy; \ +\ + /* y01 = y01 + alpha * a10t' * chi11; */ \ + PASTEMAC(ch,copycjs)( conjx, *chi11, conjx_chi11 ); \ + PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \ + if ( bli_is_conj( conj0 ) ) \ + { \ + for ( j = 0; j < f_behind; ++j ) \ + PASTEMAC(ch,axpyjs)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \ + } \ + else \ + { \ + for ( j = 0; j < f_behind; ++j ) \ + PASTEMAC(ch,axpys)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \ + } \ +\ + /* For hemv, explicitly set the imaginary component of alpha11 to + zero. 
*/ \ + PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ + if ( bli_is_conj( conjh ) ) \ + PASTEMAC(ch,seti0s)( alpha11_temp ); \ +\ + /* psi11 = psi11 + alpha * alpha11 * chi11; */ \ + PASTEMAC(ch,axpys)( alpha_chi11, alpha11_temp, *psi11 ); \ +\ + /* y21 = y21 + alpha * a21 * chi11; */ \ + if ( bli_is_conj( conj1 ) ) \ + { \ + for ( j = 0; j < f_ahead; ++j ) \ + PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \ + } \ + else \ + { \ + for ( j = 0; j < f_ahead; ++j ) \ + PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \ + } \ + } \ +\ + /* y1 = y1 + alpha * A21' * x2; (dotxf) */ \ + /* y2 = y2 + alpha * A21 * x1; (axpyf) */ \ + kfp_xf \ + ( \ + conj0, \ + conj1, \ + conjx, \ + conjx, \ + n_ahead, \ + f, \ + alpha, \ + A21, rs_at, cs_at, \ + x2, incx, \ + x1, incx, \ + one, \ + y1, incy, \ + y2, incy, \ + cntx \ + ); \ + } \ +} + +void bli_pre_hemv_8x8 + ( + double *a, + double *x, + double *y, + double *alpha, + dim_t cs_a, + dim_t rs_a + ); + +void bli_dhemv_unf_var3 + ( + uplo_t uplo, + conj_t conja, + conj_t conjx, + conj_t conjh, + dim_t m, + double* alpha, + double* a, inc_t rs_a, inc_t cs_a, + double* x, inc_t incx, + double* beta, + double* y, inc_t incy, + cntx_t* cntx + ) +{ + const num_t dt = PASTEMAC(d,type); + + double* one = PASTEMAC(d,1); + double* zero = PASTEMAC(d,0); + double* A11; + double* A21; + double* a10t; + double* alpha11; + double* a21; + double* x1; + double* x2; + double* chi11; + double* y1; + double* y2; + double* y01; + double* psi11; + double* y21; + double conjx_chi11; + double alpha_chi11; + double alpha11_temp; + dim_t i, k, j; + dim_t b_fuse, f; + dim_t n_ahead; + dim_t f_ahead, f_behind; + inc_t rs_at, cs_at; + conj_t conj0 = 0, conj1 = 0; + + /* The algorithm will be expressed in terms of the lower triangular + * case; the upper triangular case is supported by swapping the row + * and column strides of A and toggling some conj parameters. */ + if ( bli_is_lower( uplo ) ) + { + rs_at = rs_a; + cs_at = cs_a; + } + else /* if ( bli_is_upper( uplo ) ) */ + { + rs_at = cs_a; + cs_at = rs_a; + } + + /* If beta is zero, use setv. Otherwise, scale by beta. */ + if ( PASTEMAC(d,eq0)( *beta ) ) + { + /* y = 0; */ + PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + m, + zero, + y, incy, + cntx, + NULL + ); + } + else + { + /* y = beta * y; */ + PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + m, + beta, + y, incy, + cntx, + NULL + ); + } + + PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker; + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. 
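
The seti0s step above encodes the fact that a Hermitian matrix has a real diagonal, so any stored imaginary part of alpha11 must not leak into the result; when conjh is not set (the symv case) the diagonal is used as stored. A tiny sketch of the same rule with a plain complex struct (illustrative):

#include <stdbool.h>

typedef struct { double real, imag; } cplx;

/* Diagonal element as it participates in the update: for the Hermitian
   case its imaginary part is forced to zero, mirroring
   PASTEMAC(ch,seti0s)( alpha11_temp ) above. */
static cplx diag_for_hemv( cplx a_ii, bool hermitian )
{
    if ( hermitian )
        a_ii.imag = 0.0;
    return a_ii;
}
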
+ if (bli_cpuid_is_avx_supported() == TRUE) + { + kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8; + b_fuse = 8; + } + else + { + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + kfp_dotxaxpyf_ker = + bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx); + b_fuse = + bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); + } + + for ( i = 0; i < m; i += f ) + { + f = bli_determine_blocksize_dim_f( i, m, b_fuse ); + n_ahead = m - i - f; + A11 = a + (i )*rs_at + (i )*cs_at; + A21 = a + (i+f)*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + x2 = x + (i+f)*incx; + y1 = y + (i )*incy; + y2 = y + (i+f)*incy; + + /* y1 = y1 + alpha * A11 * x1; (variant 4) */ + if((f == 8) && (incx == 1) && (incy == 1) && (rs_at == 1)) + { + /*this helper function handles unit stride only*/ + bli_pre_hemv_8x8(A11, x1, y1, alpha, cs_at, rs_at); + } + else + { + for ( k = 0; k < f; ++k ) + { + f_behind = k; + f_ahead = f - k - 1; + a10t = A11 + (k )*rs_at + (0 )*cs_at; + alpha11 = A11 + (k )*rs_at + (k )*cs_at; + a21 = A11 + (k+1)*rs_at + (k )*cs_at; + chi11 = x1 + (k )*incx; + y01 = y1 + (0 )*incy; + psi11 = y1 + (k )*incy; + y21 = y1 + (k+1)*incy; + + /* y01 = y01 + alpha * a10t' * chi11; */ + PASTEMAC(d,copycjs)( conjx, + *chi11, conjx_chi11 ); + PASTEMAC(d,scal2s)( *alpha, conjx_chi11, + alpha_chi11 ); + { + for ( j = 0; j < f_behind; ++j ) + { + PASTEMAC(d,axpys) + ( alpha_chi11, + *(a10t + j*cs_at), + *(y01 + j*incy) ); + } + } + + PASTEMAC(d,copycjs)( conja, *alpha11, + alpha11_temp ); + + /* psi11 = psi11 + alpha * alpha11 * chi11; */ + PASTEMAC(d,axpys)( alpha_chi11, alpha11_temp, + *psi11 ); + + /* y21 = y21 + alpha * a21 * chi11; */ + for ( j = 0; j < f_ahead; ++j ) + { + PASTEMAC(d,axpys)( alpha_chi11, + *(a21 + j*rs_at), + *(y21 + j*incy) ); + } + } + } + + /* y1 = y1 + alpha * A21' * x2; (dotxf) */ + /* y2 = y2 + alpha * A21 * x1; (axpyf) */ + kfp_dotxaxpyf_ker + ( + conj0, + conj1, + conjx, + conjx, + n_ahead, + f, + alpha, + A21, rs_at, cs_at, + x2, incx, + x1, incx, + one, + y1, incy, + y2, incy, + cntx + ); + } +} + +GENTFUNC(float, s, hemv_unf_var3) +GENTFUNC(scomplex, c, hemv_unf_var3) +GENTFUNC(dcomplex, z, hemv_unf_var3) + + diff --git a/frame/2/her2/bli_her2_unf_var1.c b/frame/2/her2/bli_her2_unf_var1.c index 299e3d161..a0aec48f7 100644 --- a/frame/2/her2/bli_her2_unf_var1.c +++ b/frame/2/her2/bli_her2_unf_var1.c @@ -158,217 +158,5 @@ void PASTEMAC(ch,varname) \ } \ } - -#ifdef BLIS_CONFIG_EPYC - -/** - * Following is function declaration - * that computes her2 for transposed case. - * It handles triangular part of matrix and - * remaining computation in optimal way to - * gain performance improvement. 
- * a is triangular matrix, x and y are vectors - */ -void bli_dher2_trans_zen_int_4 - ( - double *a, - double *x, - double *y, - double *alpha, - dim_t m, - dim_t lda - ); - -void bli_dher2_unf_var1 - ( - uplo_t uplo, - conj_t conjx, - conj_t conjy, - conj_t conjh, - dim_t m, - double* alpha, - double* x, inc_t incx, - double* y, inc_t incy, - double* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx - ) -{ - const num_t dt = PASTEMAC(d,type); - - double* x0; - double* chi1; - double* y0; - double* psi1; - double* c10t; - double* gamma11; - double alpha0; - double alpha1; - double alpha0_chi1; - double alpha1_psi1; - double alpha0_chi1_psi1; - double conjx0_chi1; - double conjy1_psi1; - double conjy0_psi1; - dim_t i; - dim_t n_behind; - inc_t rs_ct, cs_ct; - conj_t conj0, conj1; - - /* The algorithm will be expressed in terms of the lower triangular - * case;the upper triangular case is supported by swapping the row - * and column strides of A and toggling some conj parameters. - */ - if ( bli_is_lower( uplo ) ) - { - rs_ct = rs_c; - cs_ct = cs_c; - - PASTEMAC(d,copys)( *alpha, alpha0 ); - PASTEMAC(d,copycjs)( conjh, *alpha, alpha1 ); - } - else /* if ( bli_is_upper( uplo ) ) */ - { - rs_ct = cs_c; - cs_ct = rs_c; - - /* Toggle conjugation of conjx/conjy, but only if we are being - * invoked as her2; for syr2, conjx/conjy are unchanged. - */ - conjx = bli_apply_conj( conjh, conjx ); - conjy = bli_apply_conj( conjh, conjy ); - - PASTEMAC(d,copycjs)( conjh, *alpha, alpha0 ); - PASTEMAC(d,copys)( *alpha, alpha1 ); - } - - /* Apply conjh (which carries the conjugation component of the - * Hermitian transpose, if applicable) to conjx and/or conjy as - * needed to arrive at the effective conjugation for the vector - * subproblems. - */ - conj0 = bli_apply_conj( conjh, conjy ); - conj1 = bli_apply_conj( conjh, conjx ); - - PASTECH(d,axpy2v_ker_ft) kfp_2v; - - /* Query the context for the kernel function pointer. */ - kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); - - if( (incx == 1) && (incy == 1) && (rs_ct == 1)) - { - for ( i = 0; i < m; ) - { - n_behind = i; - x0 = x + (0 )*incx; - chi1 = x + (i )*incx; - y0 = y + (0 )*incy; - psi1 = y + (i )*incy; - c10t = c + (i )*rs_ct + (0 )*cs_ct; - gamma11 = c + (i )*rs_ct + (i )*cs_ct; - - if((n_behind >= 3)) - { - bli_dher2_trans_zen_int_4(c10t, x0, y0, &alpha0, n_behind + 1, cs_ct); - i+=4; - } - else - { - /* Apply conjx and/or conjy to chi1 and/or psi1. */ - PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 ); - PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 ); - PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 ); - - /* Compute scalars for vector subproblems. */ - PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); - PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); - - /* Compute alpha * chi1 * conj(psi1) after both chi1 - * and psi1 have already been conjugated, if needed, - * by conjx and conjy. 
- */ - PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1, - alpha0_chi1_psi1 ); - - /* c10t = c10t + alpha * chi1 * y0'; */ - /* c10t = c10t + conj(alpha) * psi1 * x0'; */ - kfp_2v - ( - conj0, - conj1, - n_behind, - &alpha0_chi1, - &alpha1_psi1, - y0, incy, - x0, incx, - c10t, cs_ct, - cntx - ); - - /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) - + conj(alpha) * psi1 * conj(chi1); */ - PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 ); - PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 ); - - i+=1; - } - } - } - else - { - for ( i = 0; i < m; ++i ) - { - n_behind = i; - x0 = x + (0 )*incx; - chi1 = x + (i )*incx; - y0 = y + (0 )*incy; - psi1 = y + (i )*incy; - c10t = c + (i )*rs_ct + (0 )*cs_ct; - gamma11 = c + (i )*rs_ct + (i )*cs_ct; - - /* Apply conjx and/or conjy to chi1 and/or psi1. */ - PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 ); - PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 ); - PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 ); - - /* Compute scalars for vector subproblems. */ - PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); - PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); - - /* Compute alpha * chi1 * conj(psi1) after both chi1 - * and psi1 have already been conjugated, if needed, - * by conjx and conjy. - */ - PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1, - alpha0_chi1_psi1 ); - - /* c10t = c10t + alpha * chi1 * y0'; */ - /* c10t = c10t + conj(alpha) * psi1 * x0'; */ - kfp_2v - ( - conj0, - conj1, - n_behind, - &alpha0_chi1, - &alpha1_psi1, - y0, incy, - x0, incx, - c10t, cs_ct, - cntx - ); - - /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) - + conj(alpha) * psi1 * conj(chi1); */ - PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 ); - PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 ); - - } - } -} - -GENTFUNC(float, s, her2_unf_var1) -GENTFUNC(scomplex, c, her2_unf_var1) -GENTFUNC(dcomplex, z,her2_unf_var1) -#else INSERT_GENTFUNC_BASIC0( her2_unf_var1 ) -#endif diff --git a/frame/2/her2/bli_her2_unf_var1_amd.c b/frame/2/her2/bli_her2_unf_var1_amd.c new file mode 100644 index 000000000..43a74f49c --- /dev/null +++ b/frame/2/her2/bli_her2_unf_var1_amd.c @@ -0,0 +1,369 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + uplo_t uplo, \ + conj_t conjx, \ + conj_t conjy, \ + conj_t conjh, \ + dim_t m, \ + ctype* alpha, \ + ctype* x, inc_t incx, \ + ctype* y, inc_t incy, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + ctype* x0; \ + ctype* chi1; \ + ctype* y0; \ + ctype* psi1; \ + ctype* c10t; \ + ctype* gamma11; \ + ctype alpha0; \ + ctype alpha1; \ + ctype alpha0_chi1; \ + ctype alpha1_psi1; \ + ctype alpha0_chi1_psi1; \ + ctype conjx0_chi1; \ + ctype conjy1_psi1; \ + ctype conjy0_psi1; \ + dim_t i; \ + dim_t n_behind; \ + inc_t rs_ct, cs_ct; \ + conj_t conj0, conj1; \ +\ + /* The algorithm will be expressed in terms of the lower triangular case; + the upper triangular case is supported by swapping the row and column + strides of A and toggling some conj parameters. */ \ + if ( bli_is_lower( uplo ) ) \ + { \ + rs_ct = rs_c; \ + cs_ct = cs_c; \ +\ + PASTEMAC(ch,copys)( *alpha, alpha0 ); \ + PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + rs_ct = cs_c; \ + cs_ct = rs_c; \ +\ + /* Toggle conjugation of conjx/conjy, but only if we are being invoked + as her2; for syr2, conjx/conjy are unchanged. */ \ + conjx = bli_apply_conj( conjh, conjx ); \ + conjy = bli_apply_conj( conjh, conjy ); \ +\ + PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \ + PASTEMAC(ch,copys)( *alpha, alpha1 ); \ + } \ +\ + /* Apply conjh (which carries the conjugation component of the Hermitian + transpose, if applicable) to conjx and/or conjy as needed to arrive at + the effective conjugation for the vector subproblems. */ \ + conj0 = bli_apply_conj( conjh, conjy ); \ + conj1 = bli_apply_conj( conjh, conjx ); \ +\ + PASTECH(ch,axpy2v_ker_ft) kfp_2v; \ +\ + /* Query the context for the kernel function pointer. */ \ + kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \ +\ + for ( i = 0; i < m; ++i ) \ + { \ + n_behind = i; \ + x0 = x + (0 )*incx; \ + chi1 = x + (i )*incx; \ + y0 = y + (0 )*incy; \ + psi1 = y + (i )*incy; \ + c10t = c + (i )*rs_ct + (0 )*cs_ct; \ + gamma11 = c + (i )*rs_ct + (i )*cs_ct; \ +\ + /* Apply conjx and/or conjy to chi1 and/or psi1. */ \ + PASTEMAC(ch,copycjs)( conjx, *chi1, conjx0_chi1 ); \ + PASTEMAC(ch,copycjs)( conjy, *psi1, conjy1_psi1 ); \ + PASTEMAC(ch,copycjs)( conj0, *psi1, conjy0_psi1 ); \ +\ + /* Compute scalars for vector subproblems. */ \ + PASTEMAC(ch,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); \ + PASTEMAC(ch,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); \ +\ + /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have + already been conjugated, if needed, by conjx and conjy. 
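
The axpy2v kernel queried above fuses the two accumulations noted in the comments, c10t += alpha * chi1 * y0' and c10t += conj(alpha) * psi1 * x0', into one traversal of the output row. A real-valued reference sketch (illustrative names):

#include <stddef.h>

/* z(0:n-1) += alphax * x(0:n-1) + alphay * y(0:n-1) in a single pass;
   this is the update an axpy2v kernel fuses. */
static void axpy2v_ref( size_t n, double alphax, double alphay,
                        const double* x, const double* y, double* z )
{
    for ( size_t i = 0; i < n; ++i )
        z[i] += alphax * x[i] + alphay * y[i];
}
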
*/ \ + PASTEMAC(ch,scal2s)( alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \ +\ + /* c10t = c10t + alpha * chi1 * y0'; */ \ + /* c10t = c10t + conj(alpha) * psi1 * x0'; */ \ + kfp_2v \ + ( \ + conj0, \ + conj1, \ + n_behind, \ + &alpha0_chi1, \ + &alpha1_psi1, \ + y0, incy, \ + x0, incx, \ + c10t, cs_ct, \ + cntx \ + ); \ +\ + /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ + + conj(alpha) * psi1 * conj(chi1); */ \ + PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ + PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ +\ + /* For her2, explicitly set the imaginary component of gamma11 to + zero. */ \ + if ( bli_is_conj( conjh ) ) \ + PASTEMAC(ch,seti0s)( *gamma11 ); \ + } \ +} + +/** + * Following is function declaration + * that computes her2 for transposed case. + * It handles triangular part of matrix and + * remaining computation in optimal way to + * gain performance improvement. + * a is triangular matrix, x and y are vectors + */ +void bli_dher2_trans_zen_int_4 + ( + double *a, + double *x, + double *y, + double *alpha, + dim_t m, + dim_t lda + ); + +void bli_dher2_unf_var1 + ( + uplo_t uplo, + conj_t conjx, + conj_t conjy, + conj_t conjh, + dim_t m, + double* alpha, + double* x, inc_t incx, + double* y, inc_t incy, + double* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx + ) +{ + const num_t dt = PASTEMAC(d,type); + + double* x0; + double* chi1; + double* y0; + double* psi1; + double* c10t; + double* gamma11; + double alpha0; + double alpha1; + double alpha0_chi1; + double alpha1_psi1; + double alpha0_chi1_psi1; + double conjx0_chi1; + double conjy1_psi1; + double conjy0_psi1; + dim_t i; + dim_t n_behind; + inc_t rs_ct, cs_ct; + conj_t conj0, conj1; + + /* The algorithm will be expressed in terms of the lower triangular + * case;the upper triangular case is supported by swapping the row + * and column strides of A and toggling some conj parameters. + */ + if ( bli_is_lower( uplo ) ) + { + rs_ct = rs_c; + cs_ct = cs_c; + + PASTEMAC(d,copys)( *alpha, alpha0 ); + PASTEMAC(d,copycjs)( conjh, *alpha, alpha1 ); + } + else /* if ( bli_is_upper( uplo ) ) */ + { + rs_ct = cs_c; + cs_ct = rs_c; + + /* Toggle conjugation of conjx/conjy, but only if we are being + * invoked as her2; for syr2, conjx/conjy are unchanged. + */ + conjx = bli_apply_conj( conjh, conjx ); + conjy = bli_apply_conj( conjh, conjy ); + + PASTEMAC(d,copycjs)( conjh, *alpha, alpha0 ); + PASTEMAC(d,copys)( *alpha, alpha1 ); + } + + /* Apply conjh (which carries the conjugation component of the + * Hermitian transpose, if applicable) to conjx and/or conjy as + * needed to arrive at the effective conjugation for the vector + * subproblems. + */ + conj0 = bli_apply_conj( conjh, conjy ); + conj1 = bli_apply_conj( conjh, conjx ); + + PASTECH(d,axpy2v_ker_ft) kfp_2v; + + /* Query the context for the kernel function pointer. */ + kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); + + if( (incx == 1) && (incy == 1) && (rs_ct == 1)) + { + for ( i = 0; i < m; ) + { + n_behind = i; + x0 = x + (0 )*incx; + chi1 = x + (i )*incx; + y0 = y + (0 )*incy; + psi1 = y + (i )*incy; + c10t = c + (i )*rs_ct + (0 )*cs_ct; + gamma11 = c + (i )*rs_ct + (i )*cs_ct; + + if((n_behind >= 3)) + { + bli_dher2_trans_zen_int_4(c10t, x0, y0, &alpha0, n_behind + 1, cs_ct); + i+=4; + } + else + { + /* Apply conjx and/or conjy to chi1 and/or psi1. */ + PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 ); + PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 ); + PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 ); + + /* Compute scalars for vector subproblems. 
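
Whether it goes through bli_dher2_trans_zen_int_4 or the scalar fallback, the double-precision routine above applies the symmetric rank-2 update C := C + alpha*x*y^T + alpha*y*x^T to the lower triangle. A small column-major reference sketch for the real, unit-stride case (illustrative):

#include <stddef.h>

/* Lower triangle of a column-major matrix C (c[i + j*ldc], i >= j)
   updated as C := C + alpha*x*y^T + alpha*y*x^T, i.e. the real-valued
   rank-2 update performed by dher2/dsyr2. */
static void syr2_lower_ref( size_t m, double alpha,
                            const double* x, const double* y,
                            double* c, size_t ldc )
{
    for ( size_t j = 0; j < m; ++j )
        for ( size_t i = j; i < m; ++i )
            c[i + j*ldc] += alpha * ( x[i]*y[j] + y[i]*x[j] );
}
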
*/ + PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); + PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); + + /* Compute alpha * chi1 * conj(psi1) after both chi1 + * and psi1 have already been conjugated, if needed, + * by conjx and conjy. + */ + PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1, + alpha0_chi1_psi1 ); + + /* c10t = c10t + alpha * chi1 * y0'; */ + /* c10t = c10t + conj(alpha) * psi1 * x0'; */ + kfp_2v + ( + conj0, + conj1, + n_behind, + &alpha0_chi1, + &alpha1_psi1, + y0, incy, + x0, incx, + c10t, cs_ct, + cntx + ); + + /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) + + conj(alpha) * psi1 * conj(chi1); */ + PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 ); + PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 ); + + i+=1; + } + } + } + else + { + for ( i = 0; i < m; ++i ) + { + n_behind = i; + x0 = x + (0 )*incx; + chi1 = x + (i )*incx; + y0 = y + (0 )*incy; + psi1 = y + (i )*incy; + c10t = c + (i )*rs_ct + (0 )*cs_ct; + gamma11 = c + (i )*rs_ct + (i )*cs_ct; + + /* Apply conjx and/or conjy to chi1 and/or psi1. */ + PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 ); + PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 ); + PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 ); + + /* Compute scalars for vector subproblems. */ + PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); + PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); + + /* Compute alpha * chi1 * conj(psi1) after both chi1 + * and psi1 have already been conjugated, if needed, + * by conjx and conjy. + */ + PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1, + alpha0_chi1_psi1 ); + + /* c10t = c10t + alpha * chi1 * y0'; */ + /* c10t = c10t + conj(alpha) * psi1 * x0'; */ + kfp_2v + ( + conj0, + conj1, + n_behind, + &alpha0_chi1, + &alpha1_psi1, + y0, incy, + x0, incx, + c10t, cs_ct, + cntx + ); + + /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) + + conj(alpha) * psi1 * conj(chi1); */ + PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 ); + PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 ); + + } + } +} + +GENTFUNC(float, s, her2_unf_var1) +GENTFUNC(scomplex, c, her2_unf_var1) +GENTFUNC(dcomplex, z,her2_unf_var1) + + diff --git a/frame/2/her2/bli_her2_unf_var4.c b/frame/2/her2/bli_her2_unf_var4.c index e39c7224c..3dea31d53 100644 --- a/frame/2/her2/bli_her2_unf_var4.c +++ b/frame/2/her2/bli_her2_unf_var4.c @@ -166,192 +166,5 @@ void PASTEMAC(ch,varname) \ } \ } -#ifdef BLIS_CONFIG_EPYC -/** - * Following is function declaration - * that computes her2 for transposed case. - * It handles triangular part of matrix and - * remaining computation in optimal way to - * gain performance improvement. - * a is triangular matrix, x and y are vectors - */ -void bli_dher2_zen_int_4 - ( - double *a, - double *x, - double *y, - double *alpha, - dim_t m, - dim_t lda - ); - -void bli_dher2_unf_var4 - ( - uplo_t uplo, - conj_t conjx, - conj_t conjy, - conj_t conjh, - dim_t m, - double* alpha, - double* x, inc_t incx, - double* y, inc_t incy, - double* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx - ) -{ - - double* chi1; - double* x2; - double* psi1; - double* y2; - double* gamma11; - double* c21; - double alpha0; - double alpha0_psi1; - double alpha1_chi1; - double alpha0_chi1_psi1; - dim_t i; - dim_t n_ahead; - inc_t rs_ct, cs_ct; - - const num_t dt = PASTEMAC(d,type); - - /* The algorithm will be expressed in terms of the lower triangular - * case; the upper triangular case is supported by swapping the row - * and column strides of A and toggling some conj parameters. 
- */ - if ( bli_is_lower( uplo ) ) - { - rs_ct = rs_c; - cs_ct = cs_c; - - PASTEMAC(d,copys)( *alpha, alpha0 ); - } - else /* if ( bli_is_upper( uplo ) ) */ - { - rs_ct = cs_c; - cs_ct = rs_c; - - /* Toggle conjugation of conjx/conjy, but only if we are being - * invoked as her2; for syr2, conjx/conjy are unchanged. - */ - - PASTEMAC(d,copys)( *alpha, alpha0 ); - } - /* Apply conjh (which carries the conjugation component of the - * Hermitian transpose, if applicable) to conjx and/or conjy as - * needed to arrive at the effective conjugation for the vector - * subproblems. - */ - - PASTECH(d,axpy2v_ker_ft) kfp_2v; - - /* Query the context for the kernel function pointer. */ - kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); - - if((incx == 1) && (incy == 1) && (rs_ct == 1)) - { - for ( i = 0; i < m; ) - { - n_ahead = m - i - 1; - chi1 = x + (i ) * incx; - x2 = x + (i+1) * incx; - psi1 = y + (i ) * incy; - y2 = y + (i+1) * incy; - gamma11 = c + (i ) + (i )*cs_ct; - c21 = c + (i+1) + (i )*cs_ct; - - if((n_ahead >= 3)) - { - bli_dher2_zen_int_4(gamma11, chi1, psi1, &alpha0, n_ahead + 1, cs_ct); - i+= 4; - } - else - { - /* Compute scalars for vector subproblems. */ - PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 ); - PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 ); - - /* Compute alpha * chi1 * conj(psi1) after both chi1 - * and psi1 have - already been conjugated, if needed, by conjx and - conjy. */ - PASTEMAC(d,scal2s)( alpha0_psi1, *chi1, - alpha0_chi1_psi1 ); - - /* c21 = c21 + alpha * x2 * conj(psi1); */ - /* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ - - kfp_2v - ( - conjx, - conjy, - n_ahead, - &alpha0_psi1, - &alpha1_chi1, - x2, incx, - y2, incy, - c21, rs_ct, - cntx - ); - - - PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 ); - PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 ); - i+=1; - } - } - } - else - { - for ( i = 0; i < m; ++i) - { - n_ahead = m - i - 1; - chi1 = x + (i ) * incx; - x2 = x + (i+1) * incx; - psi1 = y + (i ) * incy; - y2 = y + (i+1) * incy; - gamma11 = c + (i ) + (i )*cs_ct; - c21 = c + (i+1) + (i )*cs_ct; - - /* Compute scalars for vector subproblems. */ - PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 ); - PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 ); - - /* Compute alpha * chi1 * conj(psi1) after both chi1 - * and psi1 have - already been conjugated, if needed, by conjx and - conjy. */ - PASTEMAC(d,scal2s)( alpha0_psi1, *chi1, - alpha0_chi1_psi1 ); - - /* c21 = c21 + alpha * x2 * conj(psi1); */ - /* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ - - kfp_2v - ( - conjx, - conjy, - n_ahead, - &alpha0_psi1, - &alpha1_chi1, - x2, incx, - y2, incy, - c21, rs_ct, - cntx - ); - - - PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 ); - PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 ); - } - } -} - -GENTFUNC(float, s, her2_unf_var4) -GENTFUNC(scomplex, c, her2_unf_var4) -GENTFUNC(dcomplex, z,her2_unf_var4) -#else INSERT_GENTFUNC_BASIC0( her2_unf_var4 ) -#endif diff --git a/frame/2/her2/bli_her2_unf_var4_amd.c b/frame/2/her2/bli_her2_unf_var4_amd.c new file mode 100644 index 000000000..4d77397cd --- /dev/null +++ b/frame/2/her2/bli_her2_unf_var4_amd.c @@ -0,0 +1,354 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + uplo_t uplo, \ + conj_t conjx, \ + conj_t conjy, \ + conj_t conjh, \ + dim_t m, \ + ctype* alpha, \ + ctype* x, inc_t incx, \ + ctype* y, inc_t incy, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + ctype* chi1; \ + ctype* x2; \ + ctype* psi1; \ + ctype* y2; \ + ctype* gamma11; \ + ctype* c21; \ + ctype alpha0; \ + ctype alpha1; \ + ctype alpha0_psi1; \ + ctype alpha1_chi1; \ + ctype alpha0_chi1_psi1; \ + ctype conjy0_psi1; \ + ctype conjx1_chi1; \ + ctype conjx0_chi1; \ + dim_t i; \ + dim_t n_ahead; \ + inc_t rs_ct, cs_ct; \ + conj_t conj0, conj1; \ + conj_t conjh_conjx; \ + conj_t conjh_conjy; \ +\ + /* Eliminate unused variable warnings. */ \ + ( void )conjh_conjx; \ + ( void )conjh_conjy; \ +\ + /* The algorithm will be expressed in terms of the lower triangular case; + the upper triangular case is supported by swapping the row and column + strides of A and toggling some conj parameters. */ \ + if ( bli_is_lower( uplo ) ) \ + { \ + rs_ct = rs_c; \ + cs_ct = cs_c; \ +\ + PASTEMAC(ch,copys)( *alpha, alpha0 ); \ + PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + rs_ct = cs_c; \ + cs_ct = rs_c; \ +\ + /* Toggle conjugation of conjx/conjy, but only if we are being invoked + as her2; for syr2, conjx/conjy are unchanged. */ \ + conjx = bli_apply_conj( conjh, conjx ); \ + conjy = bli_apply_conj( conjh, conjy ); \ +\ + PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \ + PASTEMAC(ch,copys)( *alpha, alpha1 ); \ + } \ +\ + /* Apply conjh (which carries the conjugation component of the Hermitian + transpose, if applicable) to conjx and/or conjy as needed to arrive at + the effective conjugation for the vector subproblems. 
*/ \ + conj0 = conjx; \ + conj1 = conjy; \ + conjh_conjx = bli_apply_conj( conjh, conjx ); \ + conjh_conjy = bli_apply_conj( conjh, conjy ); \ +\ + PASTECH(ch,axpy2v_ker_ft) kfp_2v; \ +\ + /* Query the context for the kernel function pointer. */ \ + kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \ +\ + for ( i = 0; i < m; ++i ) \ + { \ + n_ahead = m - i - 1; \ + chi1 = x + (i )*incx; \ + x2 = x + (i+1)*incx; \ + psi1 = y + (i )*incy; \ + y2 = y + (i+1)*incy; \ + gamma11 = c + (i )*rs_ct + (i )*cs_ct; \ + c21 = c + (i+1)*rs_ct + (i )*cs_ct; \ +\ + /* Apply conjx and/or conjy to chi1 and/or psi1. */ \ + PASTEMAC(ch,copycjs)( conjh_conjy, *psi1, conjy0_psi1 ); \ + PASTEMAC(ch,copycjs)( conjh_conjx, *chi1, conjx1_chi1 ); \ + PASTEMAC(ch,copycjs)( conj0, *chi1, conjx0_chi1 ); \ +\ + /* Compute scalars for vector subproblems. */ \ + PASTEMAC(ch,scal2s)( alpha0, conjy0_psi1, alpha0_psi1 ); \ + PASTEMAC(ch,scal2s)( alpha1, conjx1_chi1, alpha1_chi1 ); \ +\ + /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have + already been conjugated, if needed, by conjx and conjy. */ \ + PASTEMAC(ch,scal2s)( alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \ +\ + /* c21 = c21 + alpha * x2 * conj(psi1); */ \ + /* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ \ + kfp_2v \ + ( \ + conj0, \ + conj1, \ + n_ahead, \ + &alpha0_psi1, \ + &alpha1_chi1, \ + x2, incx, \ + y2, incy, \ + c21, rs_ct, \ + cntx \ + ); \ +\ + /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ + + conj(alpha) * psi1 * conj(chi1); */ \ + PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ + PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ +\ + /* For her2, explicitly set the imaginary component of gamma11 to + zero. */ \ + if ( bli_is_conj( conjh ) ) \ + PASTEMAC(ch,seti0s)( *gamma11 ); \ + } \ +} + +/** + * Following is function declaration + * that computes her2 for transposed case. + * It handles triangular part of matrix and + * remaining computation in optimal way to + * gain performance improvement. + * a is triangular matrix, x and y are vectors + */ +void bli_dher2_zen_int_4 + ( + double *a, + double *x, + double *y, + double *alpha, + dim_t m, + dim_t lda + ); + +void bli_dher2_unf_var4 + ( + uplo_t uplo, + conj_t conjx, + conj_t conjy, + conj_t conjh, + dim_t m, + double* alpha, + double* x, inc_t incx, + double* y, inc_t incy, + double* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx + ) +{ + + double* chi1; + double* x2; + double* psi1; + double* y2; + double* gamma11; + double* c21; + double alpha0; + double alpha0_psi1; + double alpha1_chi1; + double alpha0_chi1_psi1; + dim_t i; + dim_t n_ahead; + inc_t rs_ct, cs_ct; + + const num_t dt = PASTEMAC(d,type); + + /* The algorithm will be expressed in terms of the lower triangular + * case; the upper triangular case is supported by swapping the row + * and column strides of A and toggling some conj parameters. + */ + if ( bli_is_lower( uplo ) ) + { + rs_ct = rs_c; + cs_ct = cs_c; + + PASTEMAC(d,copys)( *alpha, alpha0 ); + } + else /* if ( bli_is_upper( uplo ) ) */ + { + rs_ct = cs_c; + cs_ct = rs_c; + + /* Toggle conjugation of conjx/conjy, but only if we are being + * invoked as her2; for syr2, conjx/conjy are unchanged. + */ + + PASTEMAC(d,copys)( *alpha, alpha0 ); + } + /* Apply conjh (which carries the conjugation component of the + * Hermitian transpose, if applicable) to conjx and/or conjy as + * needed to arrive at the effective conjugation for the vector + * subproblems. 
+ */ + + PASTECH(d,axpy2v_ker_ft) kfp_2v; + + /* Query the context for the kernel function pointer. */ + kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); + + if((incx == 1) && (incy == 1) && (rs_ct == 1)) + { + for ( i = 0; i < m; ) + { + n_ahead = m - i - 1; + chi1 = x + (i ) * incx; + x2 = x + (i+1) * incx; + psi1 = y + (i ) * incy; + y2 = y + (i+1) * incy; + gamma11 = c + (i ) + (i )*cs_ct; + c21 = c + (i+1) + (i )*cs_ct; + + if((n_ahead >= 3)) + { + bli_dher2_zen_int_4(gamma11, chi1, psi1, &alpha0, n_ahead + 1, cs_ct); + i+= 4; + } + else + { + /* Compute scalars for vector subproblems. */ + PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 ); + PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 ); + + /* Compute alpha * chi1 * conj(psi1) after both chi1 + * and psi1 have + already been conjugated, if needed, by conjx and + conjy. */ + PASTEMAC(d,scal2s)( alpha0_psi1, *chi1, + alpha0_chi1_psi1 ); + + /* c21 = c21 + alpha * x2 * conj(psi1); */ + /* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ + + kfp_2v + ( + conjx, + conjy, + n_ahead, + &alpha0_psi1, + &alpha1_chi1, + x2, incx, + y2, incy, + c21, rs_ct, + cntx + ); + + + PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 ); + PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 ); + i+=1; + } + } + } + else + { + for ( i = 0; i < m; ++i) + { + n_ahead = m - i - 1; + chi1 = x + (i ) * incx; + x2 = x + (i+1) * incx; + psi1 = y + (i ) * incy; + y2 = y + (i+1) * incy; + gamma11 = c + (i ) + (i )*cs_ct; + c21 = c + (i+1) + (i )*cs_ct; + + /* Compute scalars for vector subproblems. */ + PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 ); + PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 ); + + /* Compute alpha * chi1 * conj(psi1) after both chi1 + * and psi1 have + already been conjugated, if needed, by conjx and + conjy. */ + PASTEMAC(d,scal2s)( alpha0_psi1, *chi1, + alpha0_chi1_psi1 ); + + /* c21 = c21 + alpha * x2 * conj(psi1); */ + /* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ + + kfp_2v + ( + conjx, + conjy, + n_ahead, + &alpha0_psi1, + &alpha1_chi1, + x2, incx, + y2, incy, + c21, rs_ct, + cntx + ); + + + PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 ); + PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 ); + } + } +} + +GENTFUNC(float, s, her2_unf_var4) +GENTFUNC(scomplex, c, her2_unf_var4) +GENTFUNC(dcomplex, z,her2_unf_var4) + + diff --git a/frame/2/trsv/bli_trsv_unf_var1.c b/frame/2/trsv/bli_trsv_unf_var1.c index f2f9ea6a6..55e28a441 100644 --- a/frame/2/trsv/bli_trsv_unf_var1.c +++ b/frame/2/trsv/bli_trsv_unf_var1.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -231,421 +231,4 @@ void PASTEMAC(ch,varname) \ } \ } -#ifdef BLIS_CONFIG_EPYC -void bli_dtrsv_unf_var1 - ( - uplo_t uploa, - trans_t transa, - diag_t diaga, - dim_t m, - double* alpha, - double* a, inc_t rs_a, inc_t cs_a, - double* x, inc_t incx, - cntx_t* cntx - ) -{ - - double* one = PASTEMAC(d,1); - double* minus_one = PASTEMAC(d,m1); - double* A10; - double* A11; - double* A12; - double* a10t; - double* alpha11; - double* a12t; - double* x0; - double* x1; - double* x2; - double* x01; - double* chi11; - double* x21; - double alpha11_conj; - double rho1; - dim_t iter, i, k, j, l; - dim_t b_fuse, f; - dim_t n_behind, f_behind; - inc_t rs_at, cs_at; - uplo_t uploa_trans; - conj_t conja; - - /* x = alpha * x; */ - PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - m, - alpha, - x, incx, - cntx, - NULL - ); - - if( bli_does_notrans( transa ) ) - { - rs_at = rs_a; - cs_at = cs_a; - uploa_trans = uploa; - } - else /* if ( bli_does_trans( transa ) ) */ - { - rs_at = cs_a; - cs_at = rs_a; - uploa_trans = bli_uplo_toggled( uploa ); - } - - conja = bli_extract_conj( transa ); - - PASTECH(d,dotxf_ker_ft) kfp_df; - - /* Assign kernel function pointer and fusing factor. */ - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) { - kfp_df = bli_ddotxf_zen_int_8; - b_fuse = 8; - } - else - { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - num_t dt = PASTEMAC(d,type); - kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); - b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); - } - - /* We reduce all of the possible cases down to just lower/upper. 
*/ - if ( bli_is_upper( uploa_trans ) ) - { - for ( iter = 0; iter < m; iter += f ) - { - f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); - i = m - iter - f; - n_behind = iter; - A11 = a + (i )*rs_at + (i )*cs_at; - A12 = a + (i )*rs_at + (i+f)*cs_at; - x1 = x + (i )*incx; - x2 = x + (i+f)*incx; - - /* x1 = x1 - A12 * x2; */ - kfp_df - ( - conja, - BLIS_NO_CONJUGATE, - n_behind, - f, - minus_one, - A12, cs_at, rs_at, - x2, incx, - one, - x1, incx, - cntx - ); - - /* x1 = x1 / triu( A11 ); */ - for ( k = 0; k < f; ++k ) - { - l = f - k - 1; - f_behind = k; - alpha11 = A11 + (l )*rs_at + (l )*cs_at; - a12t = A11 + (l )*rs_at + (l+1)*cs_at; - chi11 = x1 + (l )*incx; - x21 = x1 + (l+1)*incx; - - /* chi11 = chi11 - a12t * x21; */ - PASTEMAC(d,set0s)( rho1 ); - if ( bli_is_conj( conja ) ) - { - for ( j = 0; j < f_behind; ++j ) - PASTEMAC(d,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); - } - else - { - for ( j = 0; j < f_behind; ++j ) - PASTEMAC(d,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); - } - PASTEMAC(d,subs)( rho1, *chi11 ); - - /* chi11 = chi11 / alpha11; */ - if ( bli_is_nonunit_diag( diaga ) ) - { - PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj ); - PASTEMAC(d,invscals)( alpha11_conj, *chi11 ); - } - } - } - } - else /* if ( bli_is_lower( uploa_trans ) ) */ - { - for ( iter = 0; iter < m; iter += f ) - { - f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); - i = iter; - n_behind = i; - A11 = a + (i )*rs_at + (i )*cs_at; - A10 = a + (i )*rs_at + (0 )*cs_at; - x1 = x + (i )*incx; - x0 = x + (0 )*incx; - - /* x1 = x1 - A10 * x0; */ - kfp_df - ( - conja, - BLIS_NO_CONJUGATE, - n_behind, - f, - minus_one, - A10, cs_at, rs_at, - x0, incx, - one, - x1, incx, - cntx - ); - - /* x1 = x1 / tril( A11 ); */ - for ( k = 0; k < f; ++k ) - { - l = k; - f_behind = l; - alpha11 = A11 + (l )*rs_at + (l )*cs_at; - a10t = A11 + (l )*rs_at + (0 )*cs_at; - chi11 = x1 + (l )*incx; - x01 = x1 + (0 )*incx; - - /* chi11 = chi11 - a10t * x01; */ - PASTEMAC(d,set0s)( rho1 ); - if ( bli_is_conj( conja ) ) - { - for ( j = 0; j < f_behind; ++j ) - PASTEMAC(d,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); - } - else - { - for ( j = 0; j < f_behind; ++j ) - PASTEMAC(d,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); - } - PASTEMAC(d,subs)( rho1, *chi11 ); - - /* chi11 = chi11 / alpha11; */ - if ( bli_is_nonunit_diag( diaga ) ) - { - PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj ); - PASTEMAC(d,invscals)( alpha11_conj, *chi11 ); - } - } - } - } -} - -void bli_strsv_unf_var1 - ( - uplo_t uploa, - trans_t transa, - diag_t diaga, - dim_t m, - float* alpha, - float* a, inc_t rs_a, inc_t cs_a, - float* x, inc_t incx, - cntx_t* cntx - ) -{ - - float* one = PASTEMAC(s,1); - float* minus_one = PASTEMAC(s,m1); - float* A10; - float* A11; - float* A12; - float* a10t; - float* alpha11; - float* a12t; - float* x0; - float* x1; - float* x2; - float* x01; - float* chi11; - float* x21; - float alpha11_conj; - float rho1; - dim_t iter, i, k, j, l; - dim_t b_fuse, f; - dim_t n_behind, f_behind; - inc_t rs_at, cs_at; - uplo_t uploa_trans; - conj_t conja; - - /* x = alpha * x; */ - PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - m, - alpha, - x, incx, - cntx, - NULL - ); - - if( bli_does_notrans( transa ) ) - { - rs_at = rs_a; - cs_at = cs_a; - uploa_trans = uploa; - } - else /* if ( bli_does_trans( transa ) ) */ - { - rs_at = cs_a; - cs_at = rs_a; - uploa_trans = bli_uplo_toggled( uploa ); - } - - conja = bli_extract_conj( transa ); - - PASTECH(s,dotxf_ker_ft) kfp_df; - - /* Assign 
kernel function pointer and fusing factor. */ - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) { - kfp_df = bli_sdotxf_zen_int_8; - b_fuse = 8; - } - else - { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - num_t dt = PASTEMAC(s,type); - kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); - b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); - - } - - /* We reduce all of the possible cases down to just lower/upper. */ - if ( bli_is_upper( uploa_trans ) ) - { - for ( iter = 0; iter < m; iter += f ) - { - f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); - i = m - iter - f; - n_behind = iter; - A11 = a + (i )*rs_at + (i )*cs_at; - A12 = a + (i )*rs_at + (i+f)*cs_at; - x1 = x + (i )*incx; - x2 = x + (i+f)*incx; - - /* x1 = x1 - A12 * x2; */ - kfp_df - ( - conja, - BLIS_NO_CONJUGATE, - n_behind, - f, - minus_one, - A12, cs_at, rs_at, - x2, incx, - one, - x1, incx, - cntx - ); - - /* x1 = x1 / triu( A11 ); */ - for ( k = 0; k < f; ++k ) - { - l = f - k - 1; - f_behind = k; - alpha11 = A11 + (l )*rs_at + (l )*cs_at; - a12t = A11 + (l )*rs_at + (l+1)*cs_at; - chi11 = x1 + (l )*incx; - x21 = x1 + (l+1)*incx; - - /* chi11 = chi11 - a12t * x21; */ - PASTEMAC(s,set0s)( rho1 ); - if ( bli_is_conj( conja ) ) - { - for ( j = 0; j < f_behind; ++j ) - PASTEMAC(s,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); - } - else - { - for ( j = 0; j < f_behind; ++j ) - PASTEMAC(s,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); - } - PASTEMAC(s,subs)( rho1, *chi11 ); - - /* chi11 = chi11 / alpha11; */ - if ( bli_is_nonunit_diag( diaga ) ) - { - PASTEMAC(s,copycjs)( conja, *alpha11, alpha11_conj ); - PASTEMAC(s,invscals)( alpha11_conj, *chi11 ); - } - } - } - } - else /* if ( bli_is_lower( uploa_trans ) ) */ - { - for ( iter = 0; iter < m; iter += f ) - { - f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); - i = iter; - n_behind = i; - A11 = a + (i )*rs_at + (i )*cs_at; - A10 = a + (i )*rs_at + (0 )*cs_at; - x1 = x + (i )*incx; - x0 = x + (0 )*incx; - - /* x1 = x1 - A10 * x0; */ - kfp_df - ( - conja, - BLIS_NO_CONJUGATE, - n_behind, - f, - minus_one, - A10, cs_at, rs_at, - x0, incx, - one, - x1, incx, - cntx - ); - - /* x1 = x1 / tril( A11 ); */ - for ( k = 0; k < f; ++k ) - { - l = k; - f_behind = l; - alpha11 = A11 + (l )*rs_at + (l )*cs_at; - a10t = A11 + (l )*rs_at + (0 )*cs_at; - chi11 = x1 + (l )*incx; - x01 = x1 + (0 )*incx; - - /* chi11 = chi11 - a10t * x01; */ - PASTEMAC(s,set0s)( rho1 ); - if ( bli_is_conj( conja ) ) - { - for ( j = 0; j < f_behind; ++j ) - PASTEMAC(s,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); - } - else - { - for ( j = 0; j < f_behind; ++j ) - PASTEMAC(s,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); - } - PASTEMAC(s,subs)( rho1, *chi11 ); - - /* chi11 = chi11 / alpha11; */ - if ( bli_is_nonunit_diag( diaga ) ) - { - PASTEMAC(s,copycjs)( conja, *alpha11, alpha11_conj ); - PASTEMAC(s,invscals)( alpha11_conj, *chi11 ); - } - } - } - } -} - -INSERT_GENTFUNC_BASIC0_CZ( trsv_unf_var1 ) -#else INSERT_GENTFUNC_BASIC0( trsv_unf_var1 ) -#endif diff --git a/frame/2/trsv/bli_trsv_unf_var1_amd.c b/frame/2/trsv/bli_trsv_unf_var1_amd.c new file mode 100644 index 000000000..4f026f2c6 --- /dev/null +++ b/frame/2/trsv/bli_trsv_unf_var1_amd.c @@ -0,0 +1,638 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* x, inc_t incx, \ + cntx_t* cntx \ + ) \ +{ \ + if(cntx == NULL) cntx = bli_gks_query_cntx(); \ + const num_t dt = PASTEMAC(ch,type); \ +\ + ctype* one = PASTEMAC(ch,1); \ + ctype* minus_one = PASTEMAC(ch,m1); \ + ctype* A10; \ + ctype* A11; \ + ctype* A12; \ + ctype* a10t; \ + ctype* alpha11; \ + ctype* a12t; \ + ctype* x0; \ + ctype* x1; \ + ctype* x2; \ + ctype* x01; \ + ctype* chi11; \ + ctype* x21; \ + ctype alpha11_conj; \ + ctype rho1; \ + dim_t iter, i, k, j, l; \ + dim_t b_fuse, f; \ + dim_t n_behind, f_behind; \ + inc_t rs_at, cs_at; \ + uplo_t uploa_trans; \ + conj_t conja; \ +\ + /* x = alpha * x; */ \ + PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + m, \ + alpha, \ + x, incx, \ + cntx, \ + NULL \ + ); \ +\ + if ( bli_does_notrans( transa ) ) \ + { \ + rs_at = rs_a; \ + cs_at = cs_a; \ + uploa_trans = uploa; \ + } \ + else /* if ( bli_does_trans( transa ) ) */ \ + { \ + rs_at = cs_a; \ + cs_at = rs_a; \ + uploa_trans = bli_uplo_toggled( uploa ); \ + } \ +\ + conja = bli_extract_conj( transa ); \ +\ + PASTECH(ch,dotxf_ker_ft) kfp_df; \ +\ + /* Query the context for the kernel function pointer and fusing factor. */ \ + kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ + b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \ +\ + /* We reduce all of the possible cases down to just lower/upper. 
*/ \ + if ( bli_is_upper( uploa_trans ) ) \ + { \ + for ( iter = 0; iter < m; iter += f ) \ + { \ + f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \ + i = m - iter - f; \ + n_behind = iter; \ + A11 = a + (i )*rs_at + (i )*cs_at; \ + A12 = a + (i )*rs_at + (i+f)*cs_at; \ + x1 = x + (i )*incx; \ + x2 = x + (i+f)*incx; \ +\ + /* x1 = x1 - A12 * x2; */ \ + kfp_df \ + ( \ + conja, \ + BLIS_NO_CONJUGATE, \ + n_behind, \ + f, \ + minus_one, \ + A12, cs_at, rs_at, \ + x2, incx, \ + one, \ + x1, incx, \ + cntx \ + ); \ +\ + /* x1 = x1 / triu( A11 ); */ \ + for ( k = 0; k < f; ++k ) \ + { \ + l = f - k - 1; \ + f_behind = k; \ + alpha11 = A11 + (l )*rs_at + (l )*cs_at; \ + a12t = A11 + (l )*rs_at + (l+1)*cs_at; \ + chi11 = x1 + (l )*incx; \ + x21 = x1 + (l+1)*incx; \ +\ + /* chi11 = chi11 - a12t * x21; */ \ + PASTEMAC(ch,set0s)( rho1 ); \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < f_behind; ++j ) \ + PASTEMAC(ch,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \ + } \ + else \ + { \ + for ( j = 0; j < f_behind; ++j ) \ + PASTEMAC(ch,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \ + } \ + PASTEMAC(ch,subs)( rho1, *chi11 ); \ +\ + /* chi11 = chi11 / alpha11; */ \ + if ( bli_is_nonunit_diag( diaga ) ) \ + { \ + PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \ + PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_lower( uploa_trans ) ) */ \ + { \ + for ( iter = 0; iter < m; iter += f ) \ + { \ + f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \ + i = iter; \ + n_behind = i; \ + A11 = a + (i )*rs_at + (i )*cs_at; \ + A10 = a + (i )*rs_at + (0 )*cs_at; \ + x1 = x + (i )*incx; \ + x0 = x + (0 )*incx; \ +\ + /* x1 = x1 - A10 * x0; */ \ + kfp_df \ + ( \ + conja, \ + BLIS_NO_CONJUGATE, \ + n_behind, \ + f, \ + minus_one, \ + A10, cs_at, rs_at, \ + x0, incx, \ + one, \ + x1, incx, \ + cntx \ + ); \ +\ + /* x1 = x1 / tril( A11 ); */ \ + for ( k = 0; k < f; ++k ) \ + { \ + l = k; \ + f_behind = l; \ + alpha11 = A11 + (l )*rs_at + (l )*cs_at; \ + a10t = A11 + (l )*rs_at + (0 )*cs_at; \ + chi11 = x1 + (l )*incx; \ + x01 = x1 + (0 )*incx; \ +\ + /* chi11 = chi11 - a10t * x01; */ \ + PASTEMAC(ch,set0s)( rho1 ); \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < f_behind; ++j ) \ + PASTEMAC(ch,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \ + } \ + else \ + { \ + for ( j = 0; j < f_behind; ++j ) \ + PASTEMAC(ch,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \ + } \ + PASTEMAC(ch,subs)( rho1, *chi11 ); \ +\ + /* chi11 = chi11 / alpha11; */ \ + if ( bli_is_nonunit_diag( diaga ) ) \ + { \ + PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \ + PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \ + } \ + } \ + } \ + } \ +} + +void bli_dtrsv_unf_var1 + ( + uplo_t uploa, + trans_t transa, + diag_t diaga, + dim_t m, + double* alpha, + double* a, inc_t rs_a, inc_t cs_a, + double* x, inc_t incx, + cntx_t* cntx + ) +{ + + double* one = PASTEMAC(d,1); + double* minus_one = PASTEMAC(d,m1); + double* A10; + double* A11; + double* A12; + double* a10t; + double* alpha11; + double* a12t; + double* x0; + double* x1; + double* x2; + double* x01; + double* chi11; + double* x21; + double alpha11_conj; + double rho1; + dim_t iter, i, k, j, l; + dim_t b_fuse, f; + dim_t n_behind, f_behind; + inc_t rs_at, cs_at; + uplo_t uploa_trans; + conj_t conja; + + /* x = alpha * x; */ + PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + m, + alpha, + x, incx, + cntx, + NULL + ); + + if( bli_does_notrans( transa ) ) + { + rs_at = rs_a; 
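/*
 * A minimal sketch of the stride-swap idea used here, assuming
 * column-major storage (row stride 1, column stride lda); the helper
 * name is illustrative. Reading A with its row and column strides
 * exchanged is the same as reading A transposed, which is why the
 * transpose case only needs rs_at/cs_at (and uploa) to be swapped
 * and toggled rather than a separate code path.
 */
static double read_elem( const double* a, dim_t i, dim_t j,
                         inc_t rs, inc_t cs )
{
    /* read_elem( a, i, j, rs, cs ) is A(i,j);
       read_elem( a, i, j, cs, rs ) is A(j,i), i.e. (A^T)(i,j). */
    return a[ i*rs + j*cs ];
}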
+ cs_at = cs_a; + uploa_trans = uploa; + } + else /* if ( bli_does_trans( transa ) ) */ + { + rs_at = cs_a; + cs_at = rs_a; + uploa_trans = bli_uplo_toggled( uploa ); + } + + conja = bli_extract_conj( transa ); + + PASTECH(d,dotxf_ker_ft) kfp_df; + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) { + kfp_df = bli_ddotxf_zen_int_8; + b_fuse = 8; + } + else + { + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + num_t dt = PASTEMAC(d,type); + kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); + b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); + } + + /* We reduce all of the possible cases down to just lower/upper. */ + if ( bli_is_upper( uploa_trans ) ) + { + for ( iter = 0; iter < m; iter += f ) + { + f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); + i = m - iter - f; + n_behind = iter; + A11 = a + (i )*rs_at + (i )*cs_at; + A12 = a + (i )*rs_at + (i+f)*cs_at; + x1 = x + (i )*incx; + x2 = x + (i+f)*incx; + + /* x1 = x1 - A12 * x2; */ + kfp_df + ( + conja, + BLIS_NO_CONJUGATE, + n_behind, + f, + minus_one, + A12, cs_at, rs_at, + x2, incx, + one, + x1, incx, + cntx + ); + + /* x1 = x1 / triu( A11 ); */ + for ( k = 0; k < f; ++k ) + { + l = f - k - 1; + f_behind = k; + alpha11 = A11 + (l )*rs_at + (l )*cs_at; + a12t = A11 + (l )*rs_at + (l+1)*cs_at; + chi11 = x1 + (l )*incx; + x21 = x1 + (l+1)*incx; + + /* chi11 = chi11 - a12t * x21; */ + PASTEMAC(d,set0s)( rho1 ); + if ( bli_is_conj( conja ) ) + { + for ( j = 0; j < f_behind; ++j ) + PASTEMAC(d,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); + } + else + { + for ( j = 0; j < f_behind; ++j ) + PASTEMAC(d,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); + } + PASTEMAC(d,subs)( rho1, *chi11 ); + + /* chi11 = chi11 / alpha11; */ + if ( bli_is_nonunit_diag( diaga ) ) + { + PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj ); + PASTEMAC(d,invscals)( alpha11_conj, *chi11 ); + } + } + } + } + else /* if ( bli_is_lower( uploa_trans ) ) */ + { + for ( iter = 0; iter < m; iter += f ) + { + f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); + i = iter; + n_behind = i; + A11 = a + (i )*rs_at + (i )*cs_at; + A10 = a + (i )*rs_at + (0 )*cs_at; + x1 = x + (i )*incx; + x0 = x + (0 )*incx; + + /* x1 = x1 - A10 * x0; */ + kfp_df + ( + conja, + BLIS_NO_CONJUGATE, + n_behind, + f, + minus_one, + A10, cs_at, rs_at, + x0, incx, + one, + x1, incx, + cntx + ); + + /* x1 = x1 / tril( A11 ); */ + for ( k = 0; k < f; ++k ) + { + l = k; + f_behind = l; + alpha11 = A11 + (l )*rs_at + (l )*cs_at; + a10t = A11 + (l )*rs_at + (0 )*cs_at; + chi11 = x1 + (l )*incx; + x01 = x1 + (0 )*incx; + + /* chi11 = chi11 - a10t * x01; */ + PASTEMAC(d,set0s)( rho1 ); + if ( bli_is_conj( conja ) ) + { + for ( j = 0; j < f_behind; ++j ) + PASTEMAC(d,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); + } + else + { + for ( j = 0; j < f_behind; ++j ) + PASTEMAC(d,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); + } + PASTEMAC(d,subs)( rho1, *chi11 ); + + /* chi11 = chi11 / alpha11; */ + if ( bli_is_nonunit_diag( diaga ) ) + { + PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj ); + PASTEMAC(d,invscals)( alpha11_conj, *chi11 ); + } + } + } + } +} + +void bli_strsv_unf_var1 + ( + uplo_t uploa, + trans_t transa, + diag_t diaga, + dim_t m, + float* alpha, + float* a, inc_t rs_a, inc_t cs_a, + float* x, inc_t incx, + cntx_t* cntx + ) +{ + + float* one = PASTEMAC(s,1); + float* minus_one = PASTEMAC(s,m1); + float* A10; + 
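/*
 * A minimal sketch of the runtime-dispatch pattern used in the
 * double-precision variant above, with illustrative stand-in names:
 * cpu_has_avx() stands in for bli_cpuid_is_avx_supported(), and the two
 * kernels stand in for the hand-written AVX kernel and the
 * context-derived fallback. The point is that the kernel is chosen at
 * run time rather than with a compile-time #ifdef such as
 * BLIS_CONFIG_EPYC.
 */
typedef void (*dotv_ker_t)( dim_t n, const double* x, const double* y,
                            double* rho );

static void dotv_ref( dim_t n, const double* x, const double* y, double* rho )
{
    double r = 0.0;
    for ( dim_t i = 0; i < n; ++i ) r += x[ i ] * y[ i ];
    *rho = r;
}

static void dotv_fast( dim_t n, const double* x, const double* y, double* rho )
{
    /* Placeholder for a vectorized kernel; same result as the reference. */
    dotv_ref( n, x, y, rho );
}

static bool cpu_has_avx( void ) { return TRUE; } /* stand-in for a CPUID query */

static void dotv_dispatch( dim_t n, const double* x, const double* y,
                           double* rho )
{
    /* Select the kernel once per call, at run time. */
    dotv_ker_t kfp = cpu_has_avx() ? dotv_fast : dotv_ref;
    kfp( n, x, y, rho );
}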
float* A11; + float* A12; + float* a10t; + float* alpha11; + float* a12t; + float* x0; + float* x1; + float* x2; + float* x01; + float* chi11; + float* x21; + float alpha11_conj; + float rho1; + dim_t iter, i, k, j, l; + dim_t b_fuse, f; + dim_t n_behind, f_behind; + inc_t rs_at, cs_at; + uplo_t uploa_trans; + conj_t conja; + + /* x = alpha * x; */ + PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + m, + alpha, + x, incx, + cntx, + NULL + ); + + if( bli_does_notrans( transa ) ) + { + rs_at = rs_a; + cs_at = cs_a; + uploa_trans = uploa; + } + else /* if ( bli_does_trans( transa ) ) */ + { + rs_at = cs_a; + cs_at = rs_a; + uploa_trans = bli_uplo_toggled( uploa ); + } + + conja = bli_extract_conj( transa ); + + PASTECH(s,dotxf_ker_ft) kfp_df; + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) { + kfp_df = bli_sdotxf_zen_int_8; + b_fuse = 8; + } + else + { + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + num_t dt = PASTEMAC(s,type); + kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); + b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); + + } + + /* We reduce all of the possible cases down to just lower/upper. */ + if ( bli_is_upper( uploa_trans ) ) + { + for ( iter = 0; iter < m; iter += f ) + { + f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); + i = m - iter - f; + n_behind = iter; + A11 = a + (i )*rs_at + (i )*cs_at; + A12 = a + (i )*rs_at + (i+f)*cs_at; + x1 = x + (i )*incx; + x2 = x + (i+f)*incx; + + /* x1 = x1 - A12 * x2; */ + kfp_df + ( + conja, + BLIS_NO_CONJUGATE, + n_behind, + f, + minus_one, + A12, cs_at, rs_at, + x2, incx, + one, + x1, incx, + cntx + ); + + /* x1 = x1 / triu( A11 ); */ + for ( k = 0; k < f; ++k ) + { + l = f - k - 1; + f_behind = k; + alpha11 = A11 + (l )*rs_at + (l )*cs_at; + a12t = A11 + (l )*rs_at + (l+1)*cs_at; + chi11 = x1 + (l )*incx; + x21 = x1 + (l+1)*incx; + + /* chi11 = chi11 - a12t * x21; */ + PASTEMAC(s,set0s)( rho1 ); + if ( bli_is_conj( conja ) ) + { + for ( j = 0; j < f_behind; ++j ) + PASTEMAC(s,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); + } + else + { + for ( j = 0; j < f_behind; ++j ) + PASTEMAC(s,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); + } + PASTEMAC(s,subs)( rho1, *chi11 ); + + /* chi11 = chi11 / alpha11; */ + if ( bli_is_nonunit_diag( diaga ) ) + { + PASTEMAC(s,copycjs)( conja, *alpha11, alpha11_conj ); + PASTEMAC(s,invscals)( alpha11_conj, *chi11 ); + } + } + } + } + else /* if ( bli_is_lower( uploa_trans ) ) */ + { + for ( iter = 0; iter < m; iter += f ) + { + f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); + i = iter; + n_behind = i; + A11 = a + (i )*rs_at + (i )*cs_at; + A10 = a + (i )*rs_at + (0 )*cs_at; + x1 = x + (i )*incx; + x0 = x + (0 )*incx; + + /* x1 = x1 - A10 * x0; */ + kfp_df + ( + conja, + BLIS_NO_CONJUGATE, + n_behind, + f, + minus_one, + A10, cs_at, rs_at, + x0, incx, + one, + x1, incx, + cntx + ); + + /* x1 = x1 / tril( A11 ); */ + for ( k = 0; k < f; ++k ) + { + l = k; + f_behind = l; + alpha11 = A11 + (l )*rs_at + (l )*cs_at; + a10t = A11 + (l )*rs_at + (0 )*cs_at; + chi11 = x1 + (l )*incx; + x01 = x1 + (0 )*incx; + + /* chi11 = chi11 - a10t * x01; */ + PASTEMAC(s,set0s)( rho1 ); + if ( bli_is_conj( conja ) ) + { + for ( j = 0; j < f_behind; ++j ) + PASTEMAC(s,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); + } + else + { + for ( j = 0; j < f_behind; ++j ) + PASTEMAC(s,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); + 
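/*
 * A reference sketch of what a dotxf-style fused kernel computes when
 * invoked as above ( y := beta*y + alpha * A^T * x, with A of size
 * m x f ), shown without conjugation and assuming column-major A; the
 * function name is illustrative. The fused kernel produces the same
 * result while reading x only once for all f fused columns.
 */
static void dotxf_ref( dim_t m, dim_t f, double alpha,
                       const double* a, inc_t lda,
                       const double* x, double beta, double* y )
{
    for ( dim_t j = 0; j < f; ++j )
    {
        double rho = 0.0;
        for ( dim_t i = 0; i < m; ++i ) rho += a[ i + j*lda ] * x[ i ];
        y[ j ] = beta * y[ j ] + alpha * rho;
    }
}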
} + PASTEMAC(s,subs)( rho1, *chi11 ); + + /* chi11 = chi11 / alpha11; */ + if ( bli_is_nonunit_diag( diaga ) ) + { + PASTEMAC(s,copycjs)( conja, *alpha11, alpha11_conj ); + PASTEMAC(s,invscals)( alpha11_conj, *chi11 ); + } + } + } + } +} + +INSERT_GENTFUNC_BASIC0_CZ( trsv_unf_var1 ) + diff --git a/frame/2/trsv/bli_trsv_unf_var2.c b/frame/2/trsv/bli_trsv_unf_var2.c index 2fd89dacf..c0ef6abe4 100644 --- a/frame/2/trsv/bli_trsv_unf_var2.c +++ b/frame/2/trsv/bli_trsv_unf_var2.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -228,805 +228,5 @@ void PASTEMAC(ch,varname) \ } \ } \ } -#ifdef BLIS_CONFIG_EPYC -void bli_dtrsv_unf_var2 - ( - uplo_t uploa, - trans_t transa, - diag_t diaga, - dim_t m, - double* alpha, - double* a, inc_t rs_a, inc_t cs_a, - double* x, inc_t incx, - cntx_t* cntx - ) -{ - double* minus_one = PASTEMAC(d,m1); - double* A01; - double* A11; - double* A21; - double* a01; - double* alpha11; - double* a21; - double* x0; - double* x1; - double* x2; - double* x01; - double* chi11; - double* x21; - double alpha11_conj; - double minus_chi11; - dim_t iter, i, k, j, l; - dim_t b_fuse, f; - dim_t n_ahead, f_ahead; - inc_t rs_at, cs_at; - uplo_t uploa_trans; - conj_t conja; - - /* x = alpha * x; */ - PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - m, - alpha, - x, incx, - cntx, - NULL - ); - - if ( bli_does_notrans( transa ) ) - { - rs_at = rs_a; - cs_at = cs_a; - uploa_trans = uploa; - } - else /* if ( bli_does_trans( transa ) ) */ - { - rs_at = cs_a; - cs_at = rs_a; - uploa_trans = bli_uplo_toggled( uploa ); - } - - conja = bli_extract_conj( transa ); - - PASTECH(d,axpyf_ker_ft) kfp_af; - - /* Assign kernel function pointer and fusing factor. */ - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) { - kfp_af = bli_daxpyf_zen_int_16x4; - b_fuse = 4; - } - else - { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx ); - b_fuse = bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_AF, cntx ); - } - - /* We reduce all of the possible cases down to just lower/upper. 
*/ - if ( bli_is_upper( uploa_trans ) ) - { - for ( iter = 0; iter < m; iter += f ) - { - f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); - i = m - iter - f; - n_ahead = i; - A11 = a + (i )*rs_at + (i )*cs_at; - A01 = a + (0 )*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - x0 = x + (0 )*incx; - - /* x1 = x1 / triu( A11 ); */ - for ( k = 0; k < f; ++k ) - { - l = f - k - 1; - f_ahead = l; - alpha11 = A11 + (l )*rs_at + (l )*cs_at; - a01 = A11 + (0 )*rs_at + (l )*cs_at; - chi11 = x1 + (l )*incx; - x01 = x1 + (0 )*incx; - - /* chi11 = chi11 / alpha11; */ - if ( bli_is_nonunit_diag( diaga ) ) - { - PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj ); - PASTEMAC(d,invscals)( alpha11_conj, *chi11 ); - } - - /* x01 = x01 - chi11 * a01; */ - PASTEMAC(d,neg2s)( *chi11, minus_chi11 ); - if ( bli_is_conj( conja ) ) - { - for ( j = 0; j < f_ahead; ++j ) - PASTEMAC(d,axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); - } - else - { - for ( j = 0; j < f_ahead; ++j ) - PASTEMAC(d,axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); - } - } - - /* x0 = x0 - A01 * x1; */ - kfp_af - ( - conja, - BLIS_NO_CONJUGATE, - n_ahead, - f, - minus_one, - A01, rs_at, cs_at, - x1, incx, - x0, incx, - cntx - ); - } - } - else /* if ( bli_is_lower( uploa_trans ) ) */ - { - for ( iter = 0; iter < m; iter += f ) - { - f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); - i = iter; - n_ahead = m - iter - f; - A11 = a + (i )*rs_at + (i )*cs_at; - A21 = a + (i+f)*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - x2 = x + (i+f)*incx; - - /* x1 = x1 / tril( A11 ); */ - for ( k = 0; k < f; ++k ) - { - l = k; - f_ahead = f - k - 1; - alpha11 = A11 + (l )*rs_at + (l )*cs_at; - a21 = A11 + (l+1)*rs_at + (l )*cs_at; - chi11 = x1 + (l )*incx; - x21 = x1 + (l+1)*incx; - - /* chi11 = chi11 / alpha11; */ - if ( bli_is_nonunit_diag( diaga ) ) - { - PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj ); - PASTEMAC(d,invscals)( alpha11_conj, *chi11 ); - } - - /* x21 = x21 - chi11 * a21; */ - PASTEMAC(d,neg2s)( *chi11, minus_chi11 ); - if ( bli_is_conj( conja ) ) - { - for ( j = 0; j < f_ahead; ++j ) - PASTEMAC(d,axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); - } - else - { - for ( j = 0; j < f_ahead; ++j ) - PASTEMAC(d,axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); - } - } - - /* x2 = x2 - A21 * x1; */ - kfp_af - ( - conja, - BLIS_NO_CONJUGATE, - n_ahead, - f, - minus_one, - A21, rs_at, cs_at, - x1, incx, - x2, incx, - cntx - ); - } - } -} - -void bli_strsv_unf_var2 - ( - uplo_t uploa, - trans_t transa, - diag_t diaga, - dim_t m, - float* alpha, - float* a, inc_t rs_a, inc_t cs_a, - float* x, inc_t incx, - cntx_t* cntx - ) -{ - - float* minus_one = PASTEMAC(s, m1); - float* A01; - float* A11; - float* A21; - float* a01; - float* alpha11; - float* a21; - float* x0; - float* x1; - float* x2; - float* x01; - float* chi11; - float* x21; - float alpha11_conj; - float minus_chi11; - dim_t iter, i, k, j, l; - dim_t b_fuse, f; - dim_t n_ahead, f_ahead; - inc_t rs_at, cs_at; - uplo_t uploa_trans; - conj_t conja; - - /* x = alpha * x; */ - PASTEMAC2(s, scalv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - m, - alpha, - x, incx, - cntx, - NULL - ); - - if( bli_does_notrans( transa ) ) - { - rs_at = rs_a; - cs_at = cs_a; - uploa_trans = uploa; - } - else /* if ( bli_does_trans( transa ) ) */ - { - rs_at = cs_a; - cs_at = rs_a; - uploa_trans = bli_uplo_toggled( uploa ); - } - - conja = bli_extract_conj( transa ); - - PASTECH(s, axpyf_ker_ft) kfp_af; - - /* Assign function pointer and fusing factor. 
*/ - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) { - kfp_af = bli_saxpyf_zen_int_5; - b_fuse = 5; - } - else - { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_FLOAT, BLIS_AXPYF_KER, cntx ); - b_fuse = bli_cntx_get_blksz_def_dt( BLIS_FLOAT, BLIS_AF, cntx ); - } - - /* We reduce all of the possible cases down to just lower/upper. */ - if ( bli_is_upper( uploa_trans ) ) - { - for ( iter = 0; iter < m; iter += f ) - { - f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); - i = m - iter - f; - n_ahead = i; - A11 = a + (i )*rs_at + (i )*cs_at; - A01 = a + (0 )*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - x0 = x + (0 )*incx; - - /* x1 = x1 / triu( A11 ); */ - for ( k = 0; k < f; ++k ) - { - l = f - k - 1; - f_ahead = l; - alpha11 = A11 + (l )*rs_at + (l )*cs_at; - a01 = A11 + (0 )*rs_at + (l )*cs_at; - chi11 = x1 + (l )*incx; - x01 = x1 + (0 )*incx; - - /* chi11 = chi11 / alpha11; */ - if ( bli_is_nonunit_diag( diaga ) ) - { - PASTEMAC(s, copycjs)( conja, *alpha11, alpha11_conj ); - PASTEMAC(s, invscals)( alpha11_conj, *chi11 ); - } - - /* x01 = x01 - chi11 * a01; */ - PASTEMAC(s, neg2s)( *chi11, minus_chi11 ); - if ( bli_is_conj( conja ) ) - { - for ( j = 0; j < f_ahead; ++j ) - PASTEMAC(s, axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); - } - else - { - for ( j = 0; j < f_ahead; ++j ) - PASTEMAC(s, axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); - } - } - - /* x0 = x0 - A01 * x1; */ - kfp_af - ( - conja, - BLIS_NO_CONJUGATE, - n_ahead, - f, - minus_one, - A01, rs_at, cs_at, - x1, incx, - x0, incx, - cntx - ); - } - } - else /* if ( bli_is_lower( uploa_trans ) ) */ - { - for ( iter = 0; iter < m; iter += f ) - { - f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); - i = iter; - n_ahead = m - iter - f; - A11 = a + (i )*rs_at + (i )*cs_at; - A21 = a + (i+f)*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - x2 = x + (i+f)*incx; - - /* x1 = x1 / tril( A11 ); */ - for ( k = 0; k < f; ++k ) - { - l = k; - f_ahead = f - k - 1; - alpha11 = A11 + (l )*rs_at + (l )*cs_at; - a21 = A11 + (l+1)*rs_at + (l )*cs_at; - chi11 = x1 + (l )*incx; - x21 = x1 + (l+1)*incx; - - /* chi11 = chi11 / alpha11; */ - if ( bli_is_nonunit_diag( diaga ) ) - { - PASTEMAC(s, copycjs)( conja, *alpha11, alpha11_conj ); - PASTEMAC(s, invscals)( alpha11_conj, *chi11 ); - } - - /* x21 = x21 - chi11 * a21; */ - PASTEMAC(s, neg2s)( *chi11, minus_chi11 ); - if ( bli_is_conj( conja ) ) - { - for ( j = 0; j < f_ahead; ++j ) - PASTEMAC(s, axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); - } - else - { - for ( j = 0; j < f_ahead; ++j ) - PASTEMAC(s, axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); - } - } - - /* x2 = x2 - A21 * x1; */ - kfp_af - ( - conja, - BLIS_NO_CONJUGATE, - n_ahead, - f, - minus_one, - A21, rs_at, cs_at, - x1, incx, - x2, incx, - cntx - ); - } - } -} - -void bli_ztrsv_unf_var2 - ( - uplo_t uploa, - trans_t transa, - diag_t diaga, - dim_t m, - dcomplex* alpha, - dcomplex* a, inc_t rs_a, inc_t cs_a, - dcomplex* x, inc_t incx, - cntx_t* cntx - ) -{ - - dcomplex* minus_one = PASTEMAC(z, m1); - dcomplex* A01; - dcomplex* A11; - dcomplex* A21; - dcomplex* a01; - dcomplex* alpha11; - dcomplex* a21; - dcomplex* x0; - dcomplex* x1; - dcomplex* x2; - dcomplex* x01; - dcomplex* chi11; - dcomplex* x21; - dcomplex alpha11_conj; - dcomplex minus_chi11; - dim_t iter, i, k, j, l; - dim_t b_fuse, f; - dim_t n_ahead, f_ahead; - 
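/*
 * A reference sketch of what an axpyf-style fused kernel computes
 * ( y := y + alpha * A * x, with A of size m x f ), shown without
 * conjugation and assuming column-major A; the function name is
 * illustrative. Variant 2 uses such a kernel for the trailing update
 * x2 := x2 - A21 * x1, fusing f columns per call.
 */
static void axpyf_ref( dim_t m, dim_t f, double alpha,
                       const double* a, inc_t lda,
                       const double* x, double* y )
{
    for ( dim_t j = 0; j < f; ++j )
        for ( dim_t i = 0; i < m; ++i )
            y[ i ] += alpha * a[ i + j*lda ] * x[ j ];
}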
inc_t rs_at, cs_at; - uplo_t uploa_trans; - conj_t conja; - - /* x = alpha * x; */ - PASTEMAC2(z, scalv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - m, - alpha, - x, incx, - cntx, - NULL - ); - - if( bli_does_notrans( transa ) ) - { - rs_at = rs_a; - cs_at = cs_a; - uploa_trans = uploa; - } - else /* if ( bli_does_trans( transa ) ) */ - { - rs_at = cs_a; - cs_at = rs_a; - uploa_trans = bli_uplo_toggled( uploa ); - } - - conja = bli_extract_conj( transa ); - - PASTECH(z, axpyf_ker_ft) kfp_af; - - /* Assign function pointer and fusing factor. */ - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) { - kfp_af = bli_zaxpyf_zen_int_5; - b_fuse = 5; - } - else - { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_DCOMPLEX, BLIS_AXPYF_KER, cntx ); - b_fuse = bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_AF, cntx ); - } - /* We reduce all of the possible cases down to just lower/upper. */ - if ( bli_is_upper( uploa_trans ) ) - { - for ( iter = 0; iter < m; iter += f ) - { - f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); - i = m - iter - f; - n_ahead = i; - A11 = a + (i )*rs_at + (i )*cs_at; - A01 = a + (0 )*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - x0 = x + (0 )*incx; - - /* x1 = x1 / triu( A11 ); */ - for ( k = 0; k < f; ++k ) - { - l = f - k - 1; - f_ahead = l; - alpha11 = A11 + (l )*rs_at + (l )*cs_at; - a01 = A11 + (0 )*rs_at + (l )*cs_at; - chi11 = x1 + (l )*incx; - x01 = x1 + (0 )*incx; - - /* chi11 = chi11 / alpha11; */ - if ( bli_is_nonunit_diag( diaga ) ) - { - PASTEMAC(z, copycjs)( conja, *alpha11, alpha11_conj ); - PASTEMAC(z, invscals)( alpha11_conj, *chi11 ); - } - - /* x01 = x01 - chi11 * a01; */ - PASTEMAC(z, neg2s)( *chi11, minus_chi11 ); - if ( bli_is_conj( conja ) ) - { - for ( j = 0; j < f_ahead; ++j ) - PASTEMAC(z, axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); - } - else - { - for ( j = 0; j < f_ahead; ++j ) - PASTEMAC(z, axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); - } - } - - /* x0 = x0 - A01 * x1; */ - kfp_af - ( - conja, - BLIS_NO_CONJUGATE, - n_ahead, - f, - minus_one, - A01, rs_at, cs_at, - x1, incx, - x0, incx, - cntx - ); - } - } - else /* if ( bli_is_lower( uploa_trans ) ) */ - { - for ( iter = 0; iter < m; iter += f ) - { - f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); - i = iter; - n_ahead = m - iter - f; - A11 = a + (i )*rs_at + (i )*cs_at; - A21 = a + (i+f)*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - x2 = x + (i+f)*incx; - - /* x1 = x1 / tril( A11 ); */ - for ( k = 0; k < f; ++k ) - { - l = k; - f_ahead = f - k - 1; - alpha11 = A11 + (l )*rs_at + (l )*cs_at; - a21 = A11 + (l+1)*rs_at + (l )*cs_at; - chi11 = x1 + (l )*incx; - x21 = x1 + (l+1)*incx; - - /* chi11 = chi11 / alpha11; */ - if ( bli_is_nonunit_diag( diaga ) ) - { - PASTEMAC(z, copycjs)( conja, *alpha11, alpha11_conj ); - PASTEMAC(z, invscals)( alpha11_conj, *chi11 ); - } - - /* x21 = x21 - chi11 * a21; */ - PASTEMAC(z, neg2s)( *chi11, minus_chi11 ); - if ( bli_is_conj( conja ) ) - { - for ( j = 0; j < f_ahead; ++j ) - PASTEMAC(z, axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); - } - else - { - for ( j = 0; j < f_ahead; ++j ) - PASTEMAC(z, axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); - } - } - - /* x2 = x2 - A21 * x1; */ - kfp_af - ( - conja, - BLIS_NO_CONJUGATE, - n_ahead, - f, - minus_one, - A21, rs_at, cs_at, - x1, incx, - x2, incx, - cntx - ); - } - } -} - -void 
bli_ctrsv_unf_var2 - ( - uplo_t uploa, - trans_t transa, - diag_t diaga, - dim_t m, - scomplex* alpha, - scomplex* a, inc_t rs_a, inc_t cs_a, - scomplex* x, inc_t incx, - cntx_t* cntx - ) -{ - - scomplex* minus_one = PASTEMAC(c, m1); - scomplex* A01; - scomplex* A11; - scomplex* A21; - scomplex* a01; - scomplex* alpha11; - scomplex* a21; - scomplex* x0; - scomplex* x1; - scomplex* x2; - scomplex* x01; - scomplex* chi11; - scomplex* x21; - scomplex alpha11_conj; - scomplex minus_chi11; - dim_t iter, i, k, j, l; - dim_t b_fuse, f; - dim_t n_ahead, f_ahead; - inc_t rs_at, cs_at; - uplo_t uploa_trans; - conj_t conja; - - /* x = alpha * x; */ - PASTEMAC2(c, scalv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - m, - alpha, - x, incx, - cntx, - NULL - ); - - if( bli_does_notrans( transa ) ) - { - rs_at = rs_a; - cs_at = cs_a; - uploa_trans = uploa; - } - else /* if ( bli_does_trans( transa ) ) */ - { - rs_at = cs_a; - cs_at = rs_a; - uploa_trans = bli_uplo_toggled( uploa ); - } - - conja = bli_extract_conj( transa ); - - PASTECH(c, axpyf_ker_ft) kfp_af; - - /* Assign function pointer and fusing factor. */ - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) { - kfp_af = bli_caxpyf_zen_int_5; - b_fuse = 5; - } - else - { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYF_KER, cntx ); - b_fuse = bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_AF, cntx ); - } - /* We reduce all of the possible cases down to just lower/upper. */ - if ( bli_is_upper( uploa_trans ) ) - { - for ( iter = 0; iter < m; iter += f ) - { - f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); - i = m - iter - f; - n_ahead = i; - A11 = a + (i )*rs_at + (i )*cs_at; - A01 = a + (0 )*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - x0 = x + (0 )*incx; - - /* x1 = x1 / triu( A11 ); */ - for ( k = 0; k < f; ++k ) - { - l = f - k - 1; - f_ahead = l; - alpha11 = A11 + (l )*rs_at + (l )*cs_at; - a01 = A11 + (0 )*rs_at + (l )*cs_at; - chi11 = x1 + (l )*incx; - x01 = x1 + (0 )*incx; - - /* chi11 = chi11 / alpha11; */ - if ( bli_is_nonunit_diag( diaga ) ) - { - PASTEMAC(c, copycjs)( conja, *alpha11, alpha11_conj ); - PASTEMAC(c, invscals)( alpha11_conj, *chi11 ); - } - - /* x01 = x01 - chi11 * a01; */ - PASTEMAC(c, neg2s)( *chi11, minus_chi11 ); - if ( bli_is_conj( conja ) ) - { - for ( j = 0; j < f_ahead; ++j ) - PASTEMAC(c, axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); - } - else - { - for ( j = 0; j < f_ahead; ++j ) - PASTEMAC(c, axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); - } - } - - /* x0 = x0 - A01 * x1; */ - kfp_af - ( - conja, - BLIS_NO_CONJUGATE, - n_ahead, - f, - minus_one, - A01, rs_at, cs_at, - x1, incx, - x0, incx, - cntx - ); - } - } - else /* if ( bli_is_lower( uploa_trans ) ) */ - { - for ( iter = 0; iter < m; iter += f ) - { - f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); - i = iter; - n_ahead = m - iter - f; - A11 = a + (i )*rs_at + (i )*cs_at; - A21 = a + (i+f)*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - x2 = x + (i+f)*incx; - - /* x1 = x1 / tril( A11 ); */ - for ( k = 0; k < f; ++k ) - { - l = k; - f_ahead = f - k - 1; - alpha11 = A11 + (l )*rs_at + (l )*cs_at; - a21 = A11 + (l+1)*rs_at + (l )*cs_at; - chi11 = x1 + (l )*incx; - x21 = x1 + (l+1)*incx; - - /* chi11 = chi11 / alpha11; */ - if ( bli_is_nonunit_diag( diaga ) ) - { - PASTEMAC(c, copycjs)( conja, *alpha11, alpha11_conj ); - PASTEMAC(c, invscals)( 
alpha11_conj, *chi11 ); - } - - /* x21 = x21 - chi11 * a21; */ - PASTEMAC(c, neg2s)( *chi11, minus_chi11 ); - if ( bli_is_conj( conja ) ) - { - for ( j = 0; j < f_ahead; ++j ) - PASTEMAC(c, axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); - } - else - { - for ( j = 0; j < f_ahead; ++j ) - PASTEMAC(c, axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); - } - } - - /* x2 = x2 - A21 * x1; */ - kfp_af - ( - conja, - BLIS_NO_CONJUGATE, - n_ahead, - f, - minus_one, - A21, rs_at, cs_at, - x1, incx, - x2, incx, - cntx - ); - } - } -} - -#else -INSERT_GENTFUNC_BASIC0( trsv_unf_var2 ) -#endif +INSERT_GENTFUNC_BASIC0( trsv_unf_var2 ) \ No newline at end of file diff --git a/frame/2/trsv/bli_trsv_unf_var2_amd.c b/frame/2/trsv/bli_trsv_unf_var2_amd.c new file mode 100644 index 000000000..51bbcabab --- /dev/null +++ b/frame/2/trsv/bli_trsv_unf_var2_amd.c @@ -0,0 +1,1024 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* x, inc_t incx, \ + cntx_t* cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + bli_init_once(); \ +\ + if( cntx == NULL ) cntx = bli_gks_query_cntx(); \ +\ + ctype* minus_one = PASTEMAC(ch,m1); \ + ctype* A01; \ + ctype* A11; \ + ctype* A21; \ + ctype* a01; \ + ctype* alpha11; \ + ctype* a21; \ + ctype* x0; \ + ctype* x1; \ + ctype* x2; \ + ctype* x01; \ + ctype* chi11; \ + ctype* x21; \ + ctype alpha11_conj; \ + ctype minus_chi11; \ + dim_t iter, i, k, j, l; \ + dim_t b_fuse, f; \ + dim_t n_ahead, f_ahead; \ + inc_t rs_at, cs_at; \ + uplo_t uploa_trans; \ + conj_t conja; \ +\ + /* x = alpha * x; */ \ + PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + m, \ + alpha, \ + x, incx, \ + cntx, \ + NULL \ + ); \ +\ + if ( bli_does_notrans( transa ) ) \ + { \ + rs_at = rs_a; \ + cs_at = cs_a; \ + uploa_trans = uploa; \ + } \ + else /* if ( bli_does_trans( transa ) ) */ \ + { \ + rs_at = cs_a; \ + cs_at = rs_a; \ + uploa_trans = bli_uplo_toggled( uploa ); \ + } \ +\ + conja = bli_extract_conj( transa ); \ +\ + PASTECH(ch,axpyf_ker_ft) kfp_af; \ +\ + /* Query the context for the kernel function pointer and fusing factor. */ \ + kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ + b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \ +\ + /* We reduce all of the possible cases down to just lower/upper. */ \ + if ( bli_is_upper( uploa_trans ) ) \ + { \ + for ( iter = 0; iter < m; iter += f ) \ + { \ + f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \ + i = m - iter - f; \ + n_ahead = i; \ + A11 = a + (i )*rs_at + (i )*cs_at; \ + A01 = a + (0 )*rs_at + (i )*cs_at; \ + x1 = x + (i )*incx; \ + x0 = x + (0 )*incx; \ +\ + /* x1 = x1 / triu( A11 ); */ \ + for ( k = 0; k < f; ++k ) \ + { \ + l = f - k - 1; \ + f_ahead = l; \ + alpha11 = A11 + (l )*rs_at + (l )*cs_at; \ + a01 = A11 + (0 )*rs_at + (l )*cs_at; \ + chi11 = x1 + (l )*incx; \ + x01 = x1 + (0 )*incx; \ +\ + /* chi11 = chi11 / alpha11; */ \ + if ( bli_is_nonunit_diag( diaga ) ) \ + { \ + PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \ + PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \ + } \ +\ + /* x01 = x01 - chi11 * a01; */ \ + PASTEMAC(ch,neg2s)( *chi11, minus_chi11 ); \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < f_ahead; ++j ) \ + PASTEMAC(ch,axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \ + } \ + else \ + { \ + for ( j = 0; j < f_ahead; ++j ) \ + PASTEMAC(ch,axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \ + } \ + } \ +\ + /* x0 = x0 - A01 * x1; */ \ + kfp_af \ + ( \ + conja, \ + BLIS_NO_CONJUGATE, \ + n_ahead, \ + f, \ + minus_one, \ + A01, rs_at, cs_at, \ + x1, incx, \ + x0, incx, \ + cntx \ + ); \ + } \ + } \ + else /* if ( bli_is_lower( uploa_trans ) ) */ \ + { \ + for ( iter = 0; iter < m; iter += f ) \ + { \ + f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \ + i = iter; \ + n_ahead = m - iter - f; \ + A11 = a + (i )*rs_at + (i )*cs_at; \ + A21 = a + (i+f)*rs_at + (i )*cs_at; \ + x1 = x + (i )*incx; \ + x2 = x + (i+f)*incx; \ +\ + /* x1 = x1 / tril( A11 ); */ \ + for ( k = 0; k < f; ++k ) \ + { \ + l = k; \ + f_ahead = f - k - 1; \ + alpha11 = A11 + (l )*rs_at + (l )*cs_at; \ + a21 = A11 + (l+1)*rs_at + (l )*cs_at; \ + chi11 = x1 + (l )*incx; \ + x21 = x1 + (l+1)*incx; \ +\ + 
/* chi11 = chi11 / alpha11; */ \ + if ( bli_is_nonunit_diag( diaga ) ) \ + { \ + PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \ + PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \ + } \ +\ + /* x21 = x21 - chi11 * a21; */ \ + PASTEMAC(ch,neg2s)( *chi11, minus_chi11 ); \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < f_ahead; ++j ) \ + PASTEMAC(ch,axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \ + } \ + else \ + { \ + for ( j = 0; j < f_ahead; ++j ) \ + PASTEMAC(ch,axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \ + } \ + } \ +\ + /* x2 = x2 - A21 * x1; */ \ + kfp_af \ + ( \ + conja, \ + BLIS_NO_CONJUGATE, \ + n_ahead, \ + f, \ + minus_one, \ + A21, rs_at, cs_at, \ + x1, incx, \ + x2, incx, \ + cntx \ + ); \ + } \ + } \ +} + +void bli_dtrsv_unf_var2 + ( + uplo_t uploa, + trans_t transa, + diag_t diaga, + dim_t m, + double* alpha, + double* a, inc_t rs_a, inc_t cs_a, + double* x, inc_t incx, + cntx_t* cntx + ) +{ + + double* minus_one = PASTEMAC(d,m1); + double* A01; + double* A11; + double* A21; + double* a01; + double* alpha11; + double* a21; + double* x0; + double* x1; + double* x2; + double* x01; + double* chi11; + double* x21; + double alpha11_conj; + double minus_chi11; + dim_t iter, i, k, j, l; + dim_t b_fuse, f; + dim_t n_ahead, f_ahead; + inc_t rs_at, cs_at; + uplo_t uploa_trans; + conj_t conja; + + // For AMD these APIS are invoked skipping intermediate framework layers + // Hence we need to ensure that cntx is set here + bli_init_once(); + if( cntx == NULL ) cntx = bli_gks_query_cntx(); + + /* x = alpha * x; */ + PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + m, + alpha, + x, incx, + cntx, + NULL + ); + + if ( bli_does_notrans( transa ) ) + { + rs_at = rs_a; + cs_at = cs_a; + uploa_trans = uploa; + } + else /* if ( bli_does_trans( transa ) ) */ + { + rs_at = cs_a; + cs_at = rs_a; + uploa_trans = bli_uplo_toggled( uploa ); + } + + conja = bli_extract_conj( transa ); + + PASTECH(d,axpyf_ker_ft) kfp_af; + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) { + kfp_af = bli_daxpyf_zen_int_16x4; + b_fuse = 4; + } + else + { + kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx ); + b_fuse = bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_AF, cntx ); + } + + /* We reduce all of the possible cases down to just lower/upper. 
*/ + if ( bli_is_upper( uploa_trans ) ) + { + for ( iter = 0; iter < m; iter += f ) + { + f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); + i = m - iter - f; + n_ahead = i; + A11 = a + (i )*rs_at + (i )*cs_at; + A01 = a + (0 )*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + x0 = x + (0 )*incx; + + /* x1 = x1 / triu( A11 ); */ + for ( k = 0; k < f; ++k ) + { + l = f - k - 1; + f_ahead = l; + alpha11 = A11 + (l )*rs_at + (l )*cs_at; + a01 = A11 + (0 )*rs_at + (l )*cs_at; + chi11 = x1 + (l )*incx; + x01 = x1 + (0 )*incx; + + /* chi11 = chi11 / alpha11; */ + if ( bli_is_nonunit_diag( diaga ) ) + { + PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj ); + PASTEMAC(d,invscals)( alpha11_conj, *chi11 ); + } + + /* x01 = x01 - chi11 * a01; */ + PASTEMAC(d,neg2s)( *chi11, minus_chi11 ); + if ( bli_is_conj( conja ) ) + { + for ( j = 0; j < f_ahead; ++j ) + PASTEMAC(d,axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); + } + else + { + for ( j = 0; j < f_ahead; ++j ) + PASTEMAC(d,axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); + } + } + + /* x0 = x0 - A01 * x1; */ + kfp_af + ( + conja, + BLIS_NO_CONJUGATE, + n_ahead, + f, + minus_one, + A01, rs_at, cs_at, + x1, incx, + x0, incx, + cntx + ); + } + } + else /* if ( bli_is_lower( uploa_trans ) ) */ + { + for ( iter = 0; iter < m; iter += f ) + { + f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); + i = iter; + n_ahead = m - iter - f; + A11 = a + (i )*rs_at + (i )*cs_at; + A21 = a + (i+f)*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + x2 = x + (i+f)*incx; + + /* x1 = x1 / tril( A11 ); */ + for ( k = 0; k < f; ++k ) + { + l = k; + f_ahead = f - k - 1; + alpha11 = A11 + (l )*rs_at + (l )*cs_at; + a21 = A11 + (l+1)*rs_at + (l )*cs_at; + chi11 = x1 + (l )*incx; + x21 = x1 + (l+1)*incx; + + /* chi11 = chi11 / alpha11; */ + if ( bli_is_nonunit_diag( diaga ) ) + { + PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj ); + PASTEMAC(d,invscals)( alpha11_conj, *chi11 ); + } + + /* x21 = x21 - chi11 * a21; */ + PASTEMAC(d,neg2s)( *chi11, minus_chi11 ); + if ( bli_is_conj( conja ) ) + { + for ( j = 0; j < f_ahead; ++j ) + PASTEMAC(d,axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); + } + else + { + for ( j = 0; j < f_ahead; ++j ) + PASTEMAC(d,axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); + } + } + + /* x2 = x2 - A21 * x1; */ + kfp_af + ( + conja, + BLIS_NO_CONJUGATE, + n_ahead, + f, + minus_one, + A21, rs_at, cs_at, + x1, incx, + x2, incx, + cntx + ); + } + } +} + +void bli_strsv_unf_var2 + ( + uplo_t uploa, + trans_t transa, + diag_t diaga, + dim_t m, + float* alpha, + float* a, inc_t rs_a, inc_t cs_a, + float* x, inc_t incx, + cntx_t* cntx + ) +{ + + float* minus_one = PASTEMAC(s, m1); + float* A01; + float* A11; + float* A21; + float* a01; + float* alpha11; + float* a21; + float* x0; + float* x1; + float* x2; + float* x01; + float* chi11; + float* x21; + float alpha11_conj; + float minus_chi11; + dim_t iter, i, k, j, l; + dim_t b_fuse, f; + dim_t n_ahead, f_ahead; + inc_t rs_at, cs_at; + uplo_t uploa_trans; + conj_t conja; + + // For AMD these APIS are invoked skipping intermediate framework layers + // Hence we need to ensure that cntx is set here + bli_init_once(); + if( cntx == NULL ) cntx = bli_gks_query_cntx(); + + /* x = alpha * x; */ + PASTEMAC2(s, scalv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + m, + alpha, + x, incx, + cntx, + NULL + ); + + if( bli_does_notrans( transa ) ) + { + rs_at = rs_a; + cs_at = cs_a; + uploa_trans = uploa; + } + else /* if ( bli_does_trans( transa ) ) */ + { + rs_at = cs_a; + cs_at = rs_a; 
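+            // Solving with A^T references the opposite triangle of A, so toggle
+            // uploa to go with the swapped row/column strides above.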
+ uploa_trans = bli_uplo_toggled( uploa ); + } + + conja = bli_extract_conj( transa ); + + PASTECH(s, axpyf_ker_ft) kfp_af; + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) { + kfp_af = bli_saxpyf_zen_int_5; + b_fuse = 5; + } + else + { + kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_FLOAT, BLIS_AXPYF_KER, cntx ); + b_fuse = bli_cntx_get_blksz_def_dt( BLIS_FLOAT, BLIS_AF, cntx ); + } + + /* We reduce all of the possible cases down to just lower/upper. */ + if ( bli_is_upper( uploa_trans ) ) + { + for ( iter = 0; iter < m; iter += f ) + { + f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); + i = m - iter - f; + n_ahead = i; + A11 = a + (i )*rs_at + (i )*cs_at; + A01 = a + (0 )*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + x0 = x + (0 )*incx; + + /* x1 = x1 / triu( A11 ); */ + for ( k = 0; k < f; ++k ) + { + l = f - k - 1; + f_ahead = l; + alpha11 = A11 + (l )*rs_at + (l )*cs_at; + a01 = A11 + (0 )*rs_at + (l )*cs_at; + chi11 = x1 + (l )*incx; + x01 = x1 + (0 )*incx; + + /* chi11 = chi11 / alpha11; */ + if ( bli_is_nonunit_diag( diaga ) ) + { + PASTEMAC(s, copycjs)( conja, *alpha11, alpha11_conj ); + PASTEMAC(s, invscals)( alpha11_conj, *chi11 ); + } + + /* x01 = x01 - chi11 * a01; */ + PASTEMAC(s, neg2s)( *chi11, minus_chi11 ); + if ( bli_is_conj( conja ) ) + { + for ( j = 0; j < f_ahead; ++j ) + PASTEMAC(s, axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); + } + else + { + for ( j = 0; j < f_ahead; ++j ) + PASTEMAC(s, axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); + } + } + + /* x0 = x0 - A01 * x1; */ + kfp_af + ( + conja, + BLIS_NO_CONJUGATE, + n_ahead, + f, + minus_one, + A01, rs_at, cs_at, + x1, incx, + x0, incx, + cntx + ); + } + } + else /* if ( bli_is_lower( uploa_trans ) ) */ + { + for ( iter = 0; iter < m; iter += f ) + { + f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); + i = iter; + n_ahead = m - iter - f; + A11 = a + (i )*rs_at + (i )*cs_at; + A21 = a + (i+f)*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + x2 = x + (i+f)*incx; + + /* x1 = x1 / tril( A11 ); */ + for ( k = 0; k < f; ++k ) + { + l = k; + f_ahead = f - k - 1; + alpha11 = A11 + (l )*rs_at + (l )*cs_at; + a21 = A11 + (l+1)*rs_at + (l )*cs_at; + chi11 = x1 + (l )*incx; + x21 = x1 + (l+1)*incx; + + /* chi11 = chi11 / alpha11; */ + if ( bli_is_nonunit_diag( diaga ) ) + { + PASTEMAC(s, copycjs)( conja, *alpha11, alpha11_conj ); + PASTEMAC(s, invscals)( alpha11_conj, *chi11 ); + } + + /* x21 = x21 - chi11 * a21; */ + PASTEMAC(s, neg2s)( *chi11, minus_chi11 ); + if ( bli_is_conj( conja ) ) + { + for ( j = 0; j < f_ahead; ++j ) + PASTEMAC(s, axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); + } + else + { + for ( j = 0; j < f_ahead; ++j ) + PASTEMAC(s, axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); + } + } + + /* x2 = x2 - A21 * x1; */ + kfp_af + ( + conja, + BLIS_NO_CONJUGATE, + n_ahead, + f, + minus_one, + A21, rs_at, cs_at, + x1, incx, + x2, incx, + cntx + ); + } + } +} + +void bli_ztrsv_unf_var2 + ( + uplo_t uploa, + trans_t transa, + diag_t diaga, + dim_t m, + dcomplex* alpha, + dcomplex* a, inc_t rs_a, inc_t cs_a, + dcomplex* x, inc_t incx, + cntx_t* cntx + ) +{ + + dcomplex* minus_one = PASTEMAC(z, m1); + dcomplex* A01; + dcomplex* A11; + dcomplex* A21; + dcomplex* a01; + dcomplex* alpha11; + dcomplex* a21; + dcomplex* x0; + dcomplex* x1; + dcomplex* x2; + dcomplex* x01; + dcomplex* chi11; + dcomplex* x21; + dcomplex alpha11_conj; + dcomplex 
minus_chi11; + dim_t iter, i, k, j, l; + dim_t b_fuse, f; + dim_t n_ahead, f_ahead; + inc_t rs_at, cs_at; + uplo_t uploa_trans; + conj_t conja; + + // For AMD these APIS are invoked skipping intermediate framework layers + // Hence we need to ensure that cntx is set here + bli_init_once(); + if( cntx == NULL ) cntx = bli_gks_query_cntx(); + + /* x = alpha * x; */ + PASTEMAC2(z, scalv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + m, + alpha, + x, incx, + cntx, + NULL + ); + + if( bli_does_notrans( transa ) ) + { + rs_at = rs_a; + cs_at = cs_a; + uploa_trans = uploa; + } + else /* if ( bli_does_trans( transa ) ) */ + { + rs_at = cs_a; + cs_at = rs_a; + uploa_trans = bli_uplo_toggled( uploa ); + } + + conja = bli_extract_conj( transa ); + + PASTECH(z, axpyf_ker_ft) kfp_af; + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) { + kfp_af = bli_zaxpyf_zen_int_5; + b_fuse = 5; + } + else + { + kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_DCOMPLEX, BLIS_AXPYF_KER, cntx ); + b_fuse = bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_AF, cntx ); + } + /* We reduce all of the possible cases down to just lower/upper. */ + if ( bli_is_upper( uploa_trans ) ) + { + for ( iter = 0; iter < m; iter += f ) + { + f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); + i = m - iter - f; + n_ahead = i; + A11 = a + (i )*rs_at + (i )*cs_at; + A01 = a + (0 )*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + x0 = x + (0 )*incx; + + /* x1 = x1 / triu( A11 ); */ + for ( k = 0; k < f; ++k ) + { + l = f - k - 1; + f_ahead = l; + alpha11 = A11 + (l )*rs_at + (l )*cs_at; + a01 = A11 + (0 )*rs_at + (l )*cs_at; + chi11 = x1 + (l )*incx; + x01 = x1 + (0 )*incx; + + /* chi11 = chi11 / alpha11; */ + if ( bli_is_nonunit_diag( diaga ) ) + { + PASTEMAC(z, copycjs)( conja, *alpha11, alpha11_conj ); + PASTEMAC(z, invscals)( alpha11_conj, *chi11 ); + } + + /* x01 = x01 - chi11 * a01; */ + PASTEMAC(z, neg2s)( *chi11, minus_chi11 ); + if ( bli_is_conj( conja ) ) + { + for ( j = 0; j < f_ahead; ++j ) + PASTEMAC(z, axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); + } + else + { + for ( j = 0; j < f_ahead; ++j ) + PASTEMAC(z, axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); + } + } + + /* x0 = x0 - A01 * x1; */ + kfp_af + ( + conja, + BLIS_NO_CONJUGATE, + n_ahead, + f, + minus_one, + A01, rs_at, cs_at, + x1, incx, + x0, incx, + cntx + ); + } + } + else /* if ( bli_is_lower( uploa_trans ) ) */ + { + for ( iter = 0; iter < m; iter += f ) + { + f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); + i = iter; + n_ahead = m - iter - f; + A11 = a + (i )*rs_at + (i )*cs_at; + A21 = a + (i+f)*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + x2 = x + (i+f)*incx; + + /* x1 = x1 / tril( A11 ); */ + for ( k = 0; k < f; ++k ) + { + l = k; + f_ahead = f - k - 1; + alpha11 = A11 + (l )*rs_at + (l )*cs_at; + a21 = A11 + (l+1)*rs_at + (l )*cs_at; + chi11 = x1 + (l )*incx; + x21 = x1 + (l+1)*incx; + + /* chi11 = chi11 / alpha11; */ + if ( bli_is_nonunit_diag( diaga ) ) + { + PASTEMAC(z, copycjs)( conja, *alpha11, alpha11_conj ); + PASTEMAC(z, invscals)( alpha11_conj, *chi11 ); + } + + /* x21 = x21 - chi11 * a21; */ + PASTEMAC(z, neg2s)( *chi11, minus_chi11 ); + if ( bli_is_conj( conja ) ) + { + for ( j = 0; j < f_ahead; ++j ) + PASTEMAC(z, axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); + } + else + { + for ( j = 0; j < f_ahead; ++j ) + PASTEMAC(z, axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); + } + 
} + + /* x2 = x2 - A21 * x1; */ + kfp_af + ( + conja, + BLIS_NO_CONJUGATE, + n_ahead, + f, + minus_one, + A21, rs_at, cs_at, + x1, incx, + x2, incx, + cntx + ); + } + } +} + +void bli_ctrsv_unf_var2 + ( + uplo_t uploa, + trans_t transa, + diag_t diaga, + dim_t m, + scomplex* alpha, + scomplex* a, inc_t rs_a, inc_t cs_a, + scomplex* x, inc_t incx, + cntx_t* cntx + ) +{ + + scomplex* minus_one = PASTEMAC(c, m1); + scomplex* A01; + scomplex* A11; + scomplex* A21; + scomplex* a01; + scomplex* alpha11; + scomplex* a21; + scomplex* x0; + scomplex* x1; + scomplex* x2; + scomplex* x01; + scomplex* chi11; + scomplex* x21; + scomplex alpha11_conj; + scomplex minus_chi11; + dim_t iter, i, k, j, l; + dim_t b_fuse, f; + dim_t n_ahead, f_ahead; + inc_t rs_at, cs_at; + uplo_t uploa_trans; + conj_t conja; + + // For AMD these APIS are invoked skipping intermediate framework layers + // Hence we need to ensure that cntx is set here + bli_init_once(); + if( cntx == NULL ) cntx = bli_gks_query_cntx(); + + /* x = alpha * x; */ + PASTEMAC2(c, scalv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + m, + alpha, + x, incx, + cntx, + NULL + ); + + if( bli_does_notrans( transa ) ) + { + rs_at = rs_a; + cs_at = cs_a; + uploa_trans = uploa; + } + else /* if ( bli_does_trans( transa ) ) */ + { + rs_at = cs_a; + cs_at = rs_a; + uploa_trans = bli_uplo_toggled( uploa ); + } + + conja = bli_extract_conj( transa ); + + PASTECH(c, axpyf_ker_ft) kfp_af; + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) { + kfp_af = bli_caxpyf_zen_int_5; + b_fuse = 5; + } + else + { + kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYF_KER, cntx ); + b_fuse = bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_AF, cntx ); + } + /* We reduce all of the possible cases down to just lower/upper. 
*/ + if ( bli_is_upper( uploa_trans ) ) + { + for ( iter = 0; iter < m; iter += f ) + { + f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); + i = m - iter - f; + n_ahead = i; + A11 = a + (i )*rs_at + (i )*cs_at; + A01 = a + (0 )*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + x0 = x + (0 )*incx; + + /* x1 = x1 / triu( A11 ); */ + for ( k = 0; k < f; ++k ) + { + l = f - k - 1; + f_ahead = l; + alpha11 = A11 + (l )*rs_at + (l )*cs_at; + a01 = A11 + (0 )*rs_at + (l )*cs_at; + chi11 = x1 + (l )*incx; + x01 = x1 + (0 )*incx; + + /* chi11 = chi11 / alpha11; */ + if ( bli_is_nonunit_diag( diaga ) ) + { + PASTEMAC(c, copycjs)( conja, *alpha11, alpha11_conj ); + PASTEMAC(c, invscals)( alpha11_conj, *chi11 ); + } + + /* x01 = x01 - chi11 * a01; */ + PASTEMAC(c, neg2s)( *chi11, minus_chi11 ); + if ( bli_is_conj( conja ) ) + { + for ( j = 0; j < f_ahead; ++j ) + PASTEMAC(c, axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); + } + else + { + for ( j = 0; j < f_ahead; ++j ) + PASTEMAC(c, axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); + } + } + + /* x0 = x0 - A01 * x1; */ + kfp_af + ( + conja, + BLIS_NO_CONJUGATE, + n_ahead, + f, + minus_one, + A01, rs_at, cs_at, + x1, incx, + x0, incx, + cntx + ); + } + } + else /* if ( bli_is_lower( uploa_trans ) ) */ + { + for ( iter = 0; iter < m; iter += f ) + { + f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); + i = iter; + n_ahead = m - iter - f; + A11 = a + (i )*rs_at + (i )*cs_at; + A21 = a + (i+f)*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + x2 = x + (i+f)*incx; + + /* x1 = x1 / tril( A11 ); */ + for ( k = 0; k < f; ++k ) + { + l = k; + f_ahead = f - k - 1; + alpha11 = A11 + (l )*rs_at + (l )*cs_at; + a21 = A11 + (l+1)*rs_at + (l )*cs_at; + chi11 = x1 + (l )*incx; + x21 = x1 + (l+1)*incx; + + /* chi11 = chi11 / alpha11; */ + if ( bli_is_nonunit_diag( diaga ) ) + { + PASTEMAC(c, copycjs)( conja, *alpha11, alpha11_conj ); + PASTEMAC(c, invscals)( alpha11_conj, *chi11 ); + } + + /* x21 = x21 - chi11 * a21; */ + PASTEMAC(c, neg2s)( *chi11, minus_chi11 ); + if ( bli_is_conj( conja ) ) + { + for ( j = 0; j < f_ahead; ++j ) + PASTEMAC(c, axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); + } + else + { + for ( j = 0; j < f_ahead; ++j ) + PASTEMAC(c, axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); + } + } + + /* x2 = x2 - A21 * x1; */ + kfp_af + ( + conja, + BLIS_NO_CONJUGATE, + n_ahead, + f, + minus_one, + A21, rs_at, cs_at, + x1, incx, + x2, incx, + cntx + ); + } + } +} diff --git a/frame/3/bli_l3_sup_int.c b/frame/3/bli_l3_sup_int.c index 7ef4bdd49..909f48059 100644 --- a/frame/3/bli_l3_sup_int.c +++ b/frame/3/bli_l3_sup_int.c @@ -48,120 +48,6 @@ err_t bli_gemmsup_int { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4); -#ifdef BLIS_CONFIG_EPYC - const num_t dt = bli_obj_dt( c ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); - const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); - const bool auto_factor = bli_rntm_auto_factor( rntm ); - const dim_t n_threads = bli_rntm_num_threads( rntm ); - - dim_t jc_new; - dim_t ic_new; - - - //bli_gemmsup_ref_var2 - //bli_gemmsup_ref_var1 - #if 0 - bli_gemmsup_ref_var1n - #else - #endif - const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b ); - const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR || - stor_id == BLIS_RRC || - stor_id == BLIS_RCR || - stor_id == BLIS_CRR ); - #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) - 
printf( "bli_l3_sup_int(): var2m primary\n" ); - #endif - - // Don't use the small/unpacked implementation if one of the matrices - // uses general stride. - if ( stor_id == BLIS_XXX ) { - AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_4, "SUP doesn't support general stide."); - return BLIS_FAILURE; - } - - if ( is_rrr_rrc_rcr_crr ) - { - // This branch handles: - // - rrr rrc rcr crr for row-preferential kernels - // - rcc crc ccr ccc for column-preferential kernels - // - Currently only row-preferential kernels are only supported. - - // calculate number of micropanels in m and n dimensions and - // recalculate the automatic thread factorization based on these number of micropanels - const dim_t mu = m / MR; - const dim_t nu = n / NR; - - // If the parallel thread factorization was automatic, we update it - // with a new factorization based on the matrix dimensions in units - // of micropanels. - if ( auto_factor ) - { - // In the block-panel algorithm, the m dimension is parallelized - // with ic_nt and the n dimension is parallelized with jc_nt. - bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new ); - - // Update the ways of parallelism for the jc and ic loops, and then - // update the current thread's root thrinfo_t node according to the - // new ways of parallelism value for the jc loop. - bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); - bli_l3_sup_thrinfo_update_root( rntm, thread ); - } - - /*Enable packing for B matrix for higher sizes*/ - if(bli_is_float(dt) && (n_threads==1)) { - if((m > 240) && (k > 240) && (n > 240)) - bli_rntm_set_pack_b( 1, rntm ); - } - - bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE, - alpha, a, b, beta, c, - stor_id, cntx, rntm, thread ); - } - else - { - // This branch handles: - // - rrr rrc rcr crr for column-preferential kernels - // - rcc crc ccr ccc for row-preferential kernels - // - Currently only row-preferential kernels are only supported. - const dim_t mu = n / MR; // the n becomes m after a transposition - const dim_t nu = m / NR; // the m becomes n after a transposition - - if ( auto_factor ) - { - // In the block-panel algorithm, the m dimension is parallelized - // with ic_nt and the n dimension is parallelized with jc_nt. - bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new ); - - // Update the ways of parallelism for the jc and ic loops, and then - // update the current thread's root thrinfo_t node according to the - // new ways of parallelism value for the jc loop. - bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); - bli_l3_sup_thrinfo_update_root( rntm, thread ); - } - - /* Enable packing for B matrix for higher sizes. Note that pack A - * becomes pack B inside var2m because this is transpose case*/ - if(bli_is_float(dt) && (n_threads==1)) { - if((m > 240) && (k > 240) && (n > 240)) - bli_rntm_set_pack_a( 1, rntm ); - } - - bli_gemmsup_ref_var2m( BLIS_TRANSPOSE, - alpha, a, b, beta, c, - stor_id, cntx, rntm, thread ); - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4); - return BLIS_SUCCESS; - -#else // #ifdef BLIS_CONFIG_EPYC - const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b ); // Don't use the small/unpacked implementation if one of the matrices @@ -335,8 +221,6 @@ err_t bli_gemmsup_int // Return success so that the caller knows that we computed the solution. 
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4) return BLIS_SUCCESS; - -#endif } // ----------------------------------------------------------------------------- @@ -401,15 +285,9 @@ err_t bli_gemmtsup_int // Decide which algorithm to use (block-panel var2m or panel-block // var1n) based on the number of micropanels in the m and n dimensions. // Also, recalculate the automatic thread factorization. -#ifdef BLIS_CONFIG_EPYC - if ( mu >= nu ) use_bp = TRUE; - else /* if ( mu < nu ) */ use_bp = TRUE;// var1n is not implemented for GEMMT - -#else if ( mu >= nu ) use_bp = TRUE; else /* if ( mu < nu ) */ use_bp = FALSE; -#endif // If the parallel thread factorization was automatic, we update it // with a new factorization based on the matrix dimensions in units // of micropanels. @@ -472,14 +350,10 @@ err_t bli_gemmtsup_int // Decide which algorithm to use (block-panel var2m or panel-block // var1n) based on the number of micropanels in the m and n dimensions. // Also, recalculate the automatic thread factorization. -#ifdef BLIS_CONFIG_EPYC - if ( mu >= nu ) use_bp = TRUE; - else /* if ( mu < nu ) */ use_bp = TRUE; //var1n is not implemented for gemmt -#else + if ( mu >= nu ) use_bp = TRUE; else /* if ( mu < nu ) */ use_bp = FALSE; -#endif // If the parallel thread factorization was automatic, we update it // with a new factorization based on the matrix dimensions in units // of micropanels. diff --git a/frame/3/bli_l3_sup_int_amd.c b/frame/3/bli_l3_sup_int_amd.c new file mode 100644 index 000000000..7bd44266d --- /dev/null +++ b/frame/3/bli_l3_sup_int_amd.c @@ -0,0 +1,352 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2019-21, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+#include "blis.h"
+
+err_t bli_gemmsup_int
+     (
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm,
+       thrinfo_t* thread
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4);
+
+    const num_t dt = bli_obj_dt( c );
+    const dim_t m  = bli_obj_length( c );
+    const dim_t n  = bli_obj_width( c );
+    const dim_t k  = bli_obj_width( a );
+    const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
+    const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
+    const bool  auto_factor = bli_rntm_auto_factor( rntm );
+    const dim_t n_threads   = bli_rntm_num_threads( rntm );
+
+    dim_t jc_new;
+    dim_t ic_new;
+
+
+    //bli_gemmsup_ref_var2
+    //bli_gemmsup_ref_var1
+    #if 0
+    bli_gemmsup_ref_var1n
+    #else
+    #endif
+    const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
+    const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
+                                      stor_id == BLIS_RRC ||
+                                      stor_id == BLIS_RCR ||
+                                      stor_id == BLIS_CRR );
+    #ifdef TRACEVAR
+    if ( bli_thread_am_ochief( thread ) )
+        printf( "bli_l3_sup_int(): var2m primary\n" );
+    #endif
+
+    // Don't use the small/unpacked implementation if one of the matrices
+    // uses general stride.
+    if ( stor_id == BLIS_XXX ) {
+        AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_4, "SUP doesn't support general stride.");
+        return BLIS_FAILURE;
+    }
+
+    if ( is_rrr_rrc_rcr_crr )
+    {
+        // This branch handles:
+        //  - rrr rrc rcr crr for row-preferential kernels
+        //  - rcc crc ccr ccc for column-preferential kernels
+        //  - Currently only row-preferential kernels are supported.
+
+        // Calculate the number of micropanels in the m and n dimensions, and
+        // recalculate the automatic thread factorization based on these
+        // micropanel counts.
+        const dim_t mu = m / MR;
+        const dim_t nu = n / NR;
+
+        // If the parallel thread factorization was automatic, we update it
+        // with a new factorization based on the matrix dimensions in units
+        // of micropanels.
+        if ( auto_factor )
+        {
+            // In the block-panel algorithm, the m dimension is parallelized
+            // with ic_nt and the n dimension is parallelized with jc_nt.
+            bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
+
+            // Update the ways of parallelism for the jc and ic loops, and then
+            // update the current thread's root thrinfo_t node according to the
+            // new ways of parallelism value for the jc loop.
+            bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
+            bli_l3_sup_thrinfo_update_root( rntm, thread );
+        }
+
+        /* Enable packing of the B matrix for larger sizes. */
+        if(bli_is_float(dt) && (n_threads==1)) {
+            if((m > 240) && (k > 240) && (n > 240))
+                bli_rntm_set_pack_b( 1, rntm );
+        }
+
+        bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
+                               alpha, a, b, beta, c,
+                               stor_id, cntx, rntm, thread );
+    }
+    else
+    {
+        // This branch handles:
+        //  - rrr rrc rcr crr for column-preferential kernels
+        //  - rcc crc ccr ccc for row-preferential kernels
+        //  - Currently only row-preferential kernels are supported.
+        const dim_t mu = n / MR; // the n becomes m after a transposition
+        const dim_t nu = m / NR; // the m becomes n after a transposition
+
+        if ( auto_factor )
+        {
+            // In the block-panel algorithm, the m dimension is parallelized
+            // with ic_nt and the n dimension is parallelized with jc_nt.
+            bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
+
+            // Update the ways of parallelism for the jc and ic loops, and then
+            // update the current thread's root thrinfo_t node according to the
+            // new ways of parallelism value for the jc loop.
+            bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
+            bli_l3_sup_thrinfo_update_root( rntm, thread );
+        }
+
+        /* Enable packing of the B matrix for larger sizes. Note that pack A
+         * becomes pack B inside var2m because this is the transpose case. */
+        if(bli_is_float(dt) && (n_threads==1)) {
+            if((m > 240) && (k > 240) && (n > 240))
+                bli_rntm_set_pack_a( 1, rntm );
+        }
+
+        bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
+                               alpha, a, b, beta, c,
+                               stor_id, cntx, rntm, thread );
+    }
+
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
+    return BLIS_SUCCESS;
+
+
+}
+
+// -----------------------------------------------------------------------------
+
+err_t bli_gemmtsup_int
+     (
+       obj_t*     alpha,
+       obj_t*     a,
+       obj_t*     b,
+       obj_t*     beta,
+       obj_t*     c,
+       cntx_t*    cntx,
+       rntm_t*    rntm,
+       thrinfo_t* thread
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4);
+//  AOCL_DTL_LOG_GEMMT_INPUTS(AOCL_DTL_LEVEL_TRACE_4, alpha, a, b, beta, c);
+
+
+    const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
+
+    // Don't use the small/unpacked implementation if one of the matrices
+    // uses general stride.
+    if ( stor_id == BLIS_XXX ) {
+        AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_4, "SUP doesn't support general stride.");
+        return BLIS_FAILURE;
+    }
+
+    const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
+                                      stor_id == BLIS_RRC ||
+                                      stor_id == BLIS_RCR ||
+                                      stor_id == BLIS_CRR );
+    const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
+
+    const num_t dt       = bli_obj_dt( c );
+    const bool  row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
+
+    const bool  is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
+                                        : is_rcc_crc_ccr_ccc );
+
+    const dim_t m  = bli_obj_length( c );
+    const dim_t n  = m;
+    const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
+    const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
+    const bool  auto_factor = bli_rntm_auto_factor( rntm );
+    const dim_t n_threads   = bli_rntm_num_threads( rntm );
+    bool  use_bp = TRUE;
+    dim_t jc_new;
+    dim_t ic_new;
+
+
+    if ( is_primary )
+    {
+        // This branch handles:
+        //  - rrr rrc rcr crr for row-preferential kernels
+        //  - rcc crc ccr ccc for column-preferential kernels
+
+        const dim_t mu = m / MR;
+        const dim_t nu = n / NR;
+
+        // Decide which algorithm to use (block-panel var2m or panel-block
+        // var1n) based on the number of micropanels in the m and n dimensions.
+        // Also, recalculate the automatic thread factorization.
+
+        if ( mu >= nu )           use_bp = TRUE;
+        else /* if ( mu < nu ) */ use_bp = TRUE; // var1n is not implemented for GEMMT
+
+        // If the parallel thread factorization was automatic, we update it
+        // with a new factorization based on the matrix dimensions in units
+        // of micropanels.
+        if ( auto_factor )
+        {
+            if ( use_bp )
+            {
+                // In the block-panel algorithm, the m dimension is parallelized
+                // with ic_nt and the n dimension is parallelized with jc_nt.
+                bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
+            }
+            else // if ( !use_bp )
+            {
+                // In the panel-block algorithm, the m dimension is parallelized
+                // with jc_nt and the n dimension is parallelized with ic_nt.
+                bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
+            }
+
+            // Update the ways of parallelism for the jc and ic loops, and then
+            // update the current thread's root thrinfo_t node according to the
+            // new ways of parallelism value for the jc loop.
+ bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); + bli_l3_sup_thrinfo_update_root( rntm, thread ); + } + + + if ( use_bp ) + { + #ifdef TRACEVAR + if ( bli_thread_am_ochief( thread ) ) + printf( "bli_l3_sup_int(): var2m primary\n" ); + #endif + // block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2() + bli_gemmtsup_ref_var2m( BLIS_NO_TRANSPOSE, + alpha, a, b, beta, c, + stor_id, cntx, rntm, thread ); + } + else // use_pb + { + #ifdef TRACEVAR + if ( bli_thread_am_ochief( thread ) ) + printf( "bli_l3_sup_int(): var1n primary\n" ); + #endif + // panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1() + bli_gemmtsup_ref_var1n( BLIS_NO_TRANSPOSE, + alpha, a, b, beta, c, + stor_id, cntx, rntm, thread ); + // *requires nudging of nc up to be a multiple of mr. + } + } + else + { + // This branch handles: + // - rrr rrc rcr crr for column-preferential kernels + // - rcc crc ccr ccc for row-preferential kernels + + const dim_t mu = n / MR; // the n becomes m after a transposition + const dim_t nu = m / NR; // the m becomes n after a transposition + + // Decide which algorithm to use (block-panel var2m or panel-block + // var1n) based on the number of micropanels in the m and n dimensions. + // Also, recalculate the automatic thread factorization. + + if ( mu >= nu ) use_bp = TRUE; + else /* if ( mu < nu ) */ use_bp = TRUE; //var1n is not implemented for gemmt + + // If the parallel thread factorization was automatic, we update it + // with a new factorization based on the matrix dimensions in units + // of micropanels. + if ( auto_factor ) + { + if ( use_bp ) + { + // In the block-panel algorithm, the m dimension is parallelized + // with ic_nt and the n dimension is parallelized with jc_nt. + bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new ); + } + else // if ( !use_bp ) + { + // In the panel-block algorithm, the m dimension is parallelized + // with jc_nt and the n dimension is parallelized with ic_nt. + bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new ); + } + + // Update the ways of parallelism for the jc and ic loops, and then + // update the current thread's root thrinfo_t node according to the + // new ways of parallelism value for the jc loop. + bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); + bli_l3_sup_thrinfo_update_root( rntm, thread ); + } + + + if ( use_bp ) + { + #ifdef TRACEVAR + if ( bli_thread_am_ochief( thread ) ) + printf( "bli_l3_sup_int(): var2m non-primary\n" ); + #endif + // panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans + bli_gemmtsup_ref_var2m( BLIS_TRANSPOSE, + alpha, a, b, beta, c, + stor_id, cntx, rntm, thread ); + } + else // use_pb + { + #ifdef TRACEVAR + if ( bli_thread_am_ochief( thread ) ) + printf( "bli_l3_sup_int(): var1n non-primary\n" ); + #endif + // block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans + bli_gemmtsup_ref_var1n( BLIS_TRANSPOSE, + alpha, a, b, beta, c, + stor_id, cntx, rntm, thread ); + // *requires nudging of mc up to be a multiple of nr. + } + } + + // Return success so that the caller knows that we computed the solution. 
+ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4) + return BLIS_SUCCESS; +} + diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index a065156bb..972a7a782 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -177,19 +177,6 @@ void bli_gemm_front dim_t m_dim_local = bli_obj_length( &c_local ); dim_t n_dim_local = bli_obj_width( &c_local ); dim_t k_dim_local = bli_obj_width( &a_local ); -#ifdef BLIS_CONFIG_EPYC - // Regression observed in sgemm native path in cases where m >= 4 * n - // after BLIS_THREAD_RATIO_M updated from 2 to 1 as part of commit - // 11dfc176a3c422729f453f6c23204cf023e9954d. Temporary workaround for - // the issue. - if( bli_obj_is_float( &c_local ) && - ( n_dim_local >= 1024 ) && - ( k_dim_local >= 1024 ) && - ( m_dim_local >= ( 4 * n_dim_local ) ) ) - { - m_dim_local *= 2; - } -#endif // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any diff --git a/frame/3/gemm/bli_gemm_front_amd.c b/frame/3/gemm/bli_gemm_front_amd.c new file mode 100644 index 000000000..41af62007 --- /dev/null +++ b/frame/3/gemm/bli_gemm_front_amd.c @@ -0,0 +1,413 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); + bli_init_once(); + + obj_t a_local; + obj_t b_local; + obj_t c_local; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_gemm_check( alpha, a, b, beta, c, cntx ); + + // If C has a zero dimension, return early. + if ( bli_obj_has_zero_dim( c ) ) + { + return; + } + + // If alpha is zero, or if A or B has a zero dimension, scale C by beta + // and return early. 
+ if ( bli_obj_equals( alpha, &BLIS_ZERO ) || + bli_obj_has_zero_dim( a ) || + bli_obj_has_zero_dim( b ) ) + { + bli_scalm( beta, c ); + return; + } + +#ifdef BLIS_ENABLE_SMALL_MATRIX + // Only handle small problems separately for homogeneous datatypes. + if ( bli_obj_dt( a ) == bli_obj_dt( b ) && + bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_comp_prec( c ) == bli_obj_prec( c ) ) + { + err_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl ); + + if ( status == BLIS_SUCCESS ) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); + return; + } + } +#endif + + // Alias A, B, and C in case we need to apply transformations. + bli_obj_alias_to( a, &a_local ); + bli_obj_alias_to( b, &b_local ); + bli_obj_alias_to( c, &c_local ); + +#ifdef BLIS_ENABLE_GEMM_MD + cntx_t cntx_local; + + // If any of the storage datatypes differ, or if the computation precision + // differs from the storage precision of C, utilize the mixed datatype + // code path. + // NOTE: If we ever want to support the caller setting the computation + // domain explicitly, we will need to check the computation dt against the + // storage dt of C (instead of the computation precision against the + // storage precision of C). + if ( bli_obj_dt( &c_local ) != bli_obj_dt( &a_local ) || + bli_obj_dt( &c_local ) != bli_obj_dt( &b_local ) || + bli_obj_comp_prec( &c_local ) != bli_obj_prec( &c_local ) ) + { + // Handle mixed datatype cases in bli_gemm_md(), which may modify + // the objects or the context. (If the context is modified, cntx + // is adjusted to point to cntx_local.) + bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx ); + } + //else // homogeneous datatypes +#endif + + // Load the pack schemas from the context and embed them into the objects + // for A and B. (Native contexts are initialized with the correct pack + // schemas, as are contexts for 1m, and if necessary bli_gemm_md() would + // have made a copy and modified the schemas, so reading them from the + // context should be a safe bet at this point.) This is a sort of hack for + // communicating the desired pack schemas to bli_gemm_cntl_create() (via + // bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows us + // to subsequently access the schemas from the control tree, which + // hopefully reduces some confusion, particularly in bli_packm_init(). + const pack_t schema_a = bli_cntx_schema_a_block( cntx ); + const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + + // Next, we handle the possibility of needing to typecast alpha to the + // computation datatype and/or beta to the storage datatype of C. + + // Attach alpha to B, and in the process typecast alpha to the target + // datatype of the matrix (which in this case is equal to the computation + // datatype). + bli_obj_scalar_attach( BLIS_NO_CONJUGATE, alpha, &b_local ); + + // Attach beta to C, and in the process typecast beta to the target + // datatype of the matrix (which in this case is equal to the storage + // datatype of C). + bli_obj_scalar_attach( BLIS_NO_CONJUGATE, beta, &c_local ); + + // Change the alpha and beta pointers to BLIS_ONE since the values have + // now been typecast and attached to the matrices above. 
+ alpha = &BLIS_ONE; + beta = &BLIS_ONE; + +#ifdef BLIS_ENABLE_GEMM_MD + // Don't perform the following optimization for ccr or crc cases, as + // those cases are sensitive to the ukernel storage preference (ie: + // transposing the operation would break them). + if ( !bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) && + !bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) ) +#endif + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + { + bli_obj_swap( &a_local, &b_local ); + + bli_obj_induce_trans( &a_local ); + bli_obj_induce_trans( &b_local ); + bli_obj_induce_trans( &c_local ); + + // We must also swap the pack schemas, which were set by bli_gemm_md() + // or the inlined code above. + bli_obj_swap_pack_schemas( &a_local, &b_local ); + } + + dim_t m_dim_local = bli_obj_length( &c_local ); + dim_t n_dim_local = bli_obj_width( &c_local ); + dim_t k_dim_local = bli_obj_width( &a_local ); + + // Regression observed in sgemm native path in cases where m >= 4 * n + // after BLIS_THREAD_RATIO_M updated from 2 to 1 as part of commit + // 11dfc176a3c422729f453f6c23204cf023e9954d. Temporary workaround for + // the issue. + if( bli_obj_is_float( &c_local ) && + ( n_dim_local >= 1024 ) && + ( k_dim_local >= 1024 ) && + ( m_dim_local >= ( 4 * n_dim_local ) ) ) + { + m_dim_local *= 2; + } + + // Parse and interpret the contents of the rntm_t object to properly + // set the ways of parallelism for each loop, and then make any + // additional modifications necessary for the current operation. + bli_rntm_set_ways_for_op + ( + BLIS_GEMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + m_dim_local, + n_dim_local, + k_dim_local, + rntm + ); + + obj_t* cp = &c_local; + obj_t* betap = beta; + +#ifdef BLIS_ENABLE_GEMM_MD +#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM + // If any of the following conditions are met, create a temporary matrix + // conformal to C into which we will accumulate the matrix product: + // - the storage precision of C differs from the computation precision; + // - the domains are mixed as crr; + // - the storage format of C does not match the preferred orientation + // of the ccr or crc cases. + // Then, after the computation is complete, this matrix will be copied + // or accumulated back to C. + const bool is_ccr_mismatch = + ( bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) && + !bli_obj_is_col_stored( &c_local ) ); + const bool is_crc_mismatch = + ( bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) && + !bli_obj_is_row_stored( &c_local ) ); + + obj_t ct; + bool use_ct = FALSE; + + // FGVZ: Consider adding another guard here that only creates and uses a + // temporary matrix for accumulation if k < c * kc, where c is some small + // constant like 2. And don't forget to use the same conditional for the + // castm() and free() at the end. + if ( + bli_obj_prec( &c_local ) != bli_obj_comp_prec( &c_local ) || + bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) || + is_ccr_mismatch || + is_crc_mismatch + ) + { + use_ct = TRUE; + } + + // If we need a temporary matrix conformal to C for whatever reason, + // we create it and prepare to use it now. 
+    if ( use_ct )
+    {
+        const dim_t m     = bli_obj_length( &c_local );
+        const dim_t n     = bli_obj_width( &c_local );
+              inc_t rs    = bli_obj_row_stride( &c_local );
+              inc_t cs    = bli_obj_col_stride( &c_local );
+
+              num_t dt_ct = bli_obj_domain( &c_local ) |
+                            bli_obj_comp_prec( &c_local );
+
+        // When performing the crr case, accumulate to a contiguously-stored
+        // real matrix so we do not have to repeatedly update C with general
+        // stride.
+        if ( bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) )
+            dt_ct = BLIS_REAL | bli_obj_comp_prec( &c_local );
+
+        // When performing the mismatched ccr or crc cases, now is the time
+        // to specify the appropriate storage so the gemm_md_c2r_ref() virtual
+        // microkernel can output directly to C (instead of using a temporary
+        // microtile).
+        if      ( is_ccr_mismatch ) { rs = 1; cs = m; }
+        else if ( is_crc_mismatch ) { rs = n; cs = 1; }
+
+        bli_obj_create( dt_ct, m, n, rs, cs, &ct );
+
+        const num_t dt_exec = bli_obj_exec_dt( &c_local );
+        const num_t dt_comp = bli_obj_comp_dt( &c_local );
+
+        bli_obj_set_target_dt( dt_ct, &ct );
+        bli_obj_set_exec_dt( dt_exec, &ct );
+        bli_obj_set_comp_dt( dt_comp, &ct );
+
+        // A naive approach would cast C to the computation datatype,
+        // compute with beta, and then cast the result back to the
+        // user-provided output matrix. However, we employ a different
+        // approach that halves the number of memops on C (or its
+        // typecast temporary) by writing the A*B product directly to
+        // temporary storage, and then using xpbym to scale the
+        // output matrix by beta and accumulate/cast the A*B product.
+        //bli_castm( &c_local, &ct );
+        betap = &BLIS_ZERO;
+
+        cp = &ct;
+    }
+#endif
+#endif
+
+    // Invoke the internal back-end via the thread handler.
+    bli_l3_thread_decorator
+    (
+      bli_gemm_int,
+      BLIS_GEMM, // operation family id
+      alpha,
+      &a_local,
+      &b_local,
+      betap,
+      cp,
+      cntx,
+      rntm,
+      cntl
+    );
+
+#ifdef BLIS_ENABLE_GEMM_MD
+#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
+    // If we created a temporary matrix conformal to C for whatever reason,
+    // we copy/accumulate the result back to C and then release the object.
+ if ( use_ct ) + { + obj_t beta_local; + + bli_obj_scalar_detach( &c_local, &beta_local ); + + //bli_castnzm( &ct, &c_local ); + bli_xpbym( &ct, &beta_local, &c_local ); + + bli_obj_free( &ct ); + } +#endif +#endif + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); +} + +// ----------------------------------------------------------------------------- + +#if 0 + if ( bli_obj_dt( a ) != bli_obj_dt( b ) || + bli_obj_dt( a ) != bli_obj_dt( c ) || + bli_obj_comp_prec( c ) != bli_obj_prec( c ) ) + { + const bool a_is_real = bli_obj_is_real( a ); + const bool a_is_comp = bli_obj_is_complex( a ); + const bool b_is_real = bli_obj_is_real( b ); + const bool b_is_comp = bli_obj_is_complex( b ); + const bool c_is_real = bli_obj_is_real( c ); + const bool c_is_comp = bli_obj_is_complex( c ); + + const bool a_is_single = bli_obj_is_single_prec( a ); + const bool a_is_double = bli_obj_is_double_prec( a ); + const bool b_is_single = bli_obj_is_single_prec( b ); + const bool b_is_double = bli_obj_is_double_prec( b ); + const bool c_is_single = bli_obj_is_single_prec( c ); + const bool c_is_double = bli_obj_is_double_prec( c ); + + const bool comp_single = bli_obj_comp_prec( c ) == BLIS_SINGLE_PREC; + const bool comp_double = bli_obj_comp_prec( c ) == BLIS_DOUBLE_PREC; + + const bool mixeddomain = bli_obj_domain( c ) != bli_obj_domain( a ) || + bli_obj_domain( c ) != bli_obj_domain( b ); + + ( void )a_is_real; ( void )a_is_comp; + ( void )b_is_real; ( void )b_is_comp; + ( void )c_is_real; ( void )c_is_comp; + ( void )a_is_single; ( void )a_is_double; + ( void )b_is_single; ( void )b_is_double; + ( void )c_is_single; ( void )c_is_double; + ( void )comp_single; ( void )comp_double; + + if ( + //( c_is_comp && a_is_comp && b_is_real ) || + //( c_is_comp && a_is_real && b_is_comp ) || + //( c_is_real && a_is_comp && b_is_comp ) || + //( c_is_comp && a_is_real && b_is_real ) || + //( c_is_real && a_is_comp && b_is_real ) || + //( c_is_real && a_is_real && b_is_comp ) || + //FALSE + TRUE + ) + { + if ( + ( c_is_single && a_is_single && b_is_single && mixeddomain ) || + ( c_is_single && a_is_single && b_is_single && comp_single ) || + ( c_is_single && a_is_single && b_is_single && comp_double ) || + ( c_is_single && a_is_single && b_is_double ) || + ( c_is_single && a_is_double && b_is_single ) || + ( c_is_double && a_is_single && b_is_single ) || + ( c_is_single && a_is_double && b_is_double ) || + ( c_is_double && a_is_single && b_is_double ) || + ( c_is_double && a_is_double && b_is_single ) || + ( c_is_double && a_is_double && b_is_double && comp_single ) || + ( c_is_double && a_is_double && b_is_double && comp_double ) || + ( c_is_double && a_is_double && b_is_double && mixeddomain ) || + FALSE + ) + bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl ); + else + bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl ); + } + else + bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl ); + return; + } +#else +#if 0 + // If any of the storage datatypes differ, or if the execution precision + // differs from the storage precision of C, utilize the mixed datatype + // code path. + // NOTE: We could check the exec dt against the storage dt of C, but for + // now we don't support the caller setting the execution domain + // explicitly. 
+ if ( bli_obj_dt( a ) != bli_obj_dt( b ) || + bli_obj_dt( a ) != bli_obj_dt( c ) || + bli_obj_comp_prec( c ) != bli_obj_prec( c ) ) + { + bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl ); + return; + } +#endif +#endif + diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c index d5d831554..98ea947f3 100644 --- a/frame/base/bli_cpuid.c +++ b/frame/base/bli_cpuid.c @@ -501,6 +501,25 @@ bool bli_cpuid_is_bulldozer return TRUE; } +bool bli_cpuid_is_avx_supported( void ) +{ + uint32_t family, model, features; + + // Call the CPUID instruction and parse its results into a family id, + // model id, and a feature bit field. The return value encodes the + // vendor. + bli_cpuid_query( &family, &model, &features ); + + // Check for expected CPU features. + const uint32_t expected = FEATURE_AVX | + FEATURE_FMA3 | + FEATURE_AVX2; + + if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; + + return TRUE; +} + #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) arch_t bli_cpuid_query_id( void ) diff --git a/frame/base/bli_cpuid.h b/frame/base/bli_cpuid.h index a9f960847..cb4c45ab5 100644 --- a/frame/base/bli_cpuid.h +++ b/frame/base/bli_cpuid.h @@ -133,7 +133,7 @@ BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want ) void get_cpu_name( char *cpu_name ); int vpu_count( void ); - +bool bli_cpuid_is_avx_supported(void); enum { @@ -160,6 +160,8 @@ enum FEATURE_AVX512VL = 0x4000 }; + + #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ); diff --git a/frame/compat/bla_amax.c b/frame/compat/bla_amax.c index 214dfe67a..b1cf77e7b 100644 --- a/frame/compat/bla_amax.c +++ b/frame/compat/bla_amax.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018-2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -98,217 +98,5 @@ f77_int PASTEF772(i,chx,blasname) \ } #ifdef BLIS_ENABLE_BLAS -#ifdef BLIS_CONFIG_EPYC - -f77_int isamax_ - ( - const f77_int* n, - const float* x, const f77_int* incx - ) -{ - - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); - AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx); - - dim_t n0; - float* x0; - inc_t incx0; - gint_t bli_index; - f77_int f77_index; - - /* If the vector is empty, return an index of zero. This early check - is needed to emulate netlib BLAS. Without it, bli_?amaxv() will - return 0, which ends up getting incremented to 1 (below) before - being returned, which is not what we want. */ - if ( *n < 1 || *incx <= 0 ) { - AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "isamax_: vector empty"); - return 0; - } - - /* Initialize BLIS. */ -// bli_init_auto(); - - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. 
The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - - x0 = ((float*)x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - - } - else - { - x0 = ((float*)x); - incx0 = ( inc_t )(*incx); - } - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) - { - /* Call BLIS kernel */ - bli_samaxv_zen_int - ( - n0, - x0, incx0, - &bli_index, - NULL - ); - } - else - { - PASTEMAC2(s,amaxv,BLIS_TAPI_EX_SUF) - ( - n0, - x0, incx0, - &bli_index, - NULL, - NULL - ); - } - - /* Convert zero-based BLIS (C) index to one-based BLAS (Fortran) - index. Also, if the BLAS integer size differs from the BLIS - integer size, that typecast occurs here. */ - f77_index = bli_index + 1; - - /* Finalize BLIS. */ -// bli_finalize_auto(); - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - - return f77_index; -} - -f77_int idamax_ - ( - const f77_int* n, - const double* x, const f77_int* incx - ) -{ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); - AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx); - - dim_t n0; - double* x0; - inc_t incx0; - gint_t bli_index; - f77_int f77_index; - - /* If the vector is empty, return an index of zero. This early check - is needed to emulate netlib BLAS. Without it, bli_?amaxv() will - return 0, which ends up getting incremented to 1 (below) before - being returned, which is not what we want. */ - if ( *n < 1 || *incx <= 0 ) { - AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "idamax_: vector empty"); - return 0; - } - - /* Initialize BLIS. */ -// bli_init_auto(); - - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. 
*/ - - x0 = ((double*)x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - - } - else - { - x0 = ((double*)x); - incx0 = ( inc_t )(*incx); - } - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) - { - /* Call BLIS kernel */ - bli_damaxv_zen_int - ( - n0, - x0, incx0, - &bli_index, - NULL - ); - } - else - { - PASTEMAC2(d,amaxv,BLIS_TAPI_EX_SUF) - ( - n0, - x0, incx0, - &bli_index, - NULL, - NULL - ); - } - - /* Convert zero-based BLIS (C) index to one-based BLAS (Fortran) - index. Also, if the BLAS integer size differs from the BLIS - integer size, that typecast occurs here. */ - f77_index = bli_index + 1; - - /* Finalize BLIS. */ -// bli_finalize_auto(); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return f77_index; -} - -INSERT_GENTFUNC_BLAS_CZ( amax, amaxv ) -#else INSERT_GENTFUNC_BLAS( amax, amaxv ) #endif -#endif diff --git a/frame/compat/bla_amax_amd.c b/frame/compat/bla_amax_amd.c new file mode 100644 index 000000000..7f1a771f7 --- /dev/null +++ b/frame/compat/bla_amax_amd.c @@ -0,0 +1,295 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define BLAS-to-BLIS interfaces. 
+// +#undef GENTFUNC +#define GENTFUNC( ftype_x, chx, blasname, blisname ) \ +\ +f77_int PASTEF772(i,chx,blasname) \ + ( \ + const f77_int* n, \ + const ftype_x* x, const f77_int* incx \ + ) \ +{ \ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \ + AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(chx), *n, *incx) \ +\ + dim_t n0; \ + ftype_x* x0; \ + inc_t incx0; \ + gint_t bli_index; \ + f77_int f77_index; \ +\ + /* If the vector is empty, return an index of zero. This early check + is needed to emulate netlib BLAS. Without it, bli_?amaxv() will + return 0, which ends up getting incremented to 1 (below) before + being returned, which is not what we want. */ \ + if ( *n < 1 || *incx <= 0 ) { \ + AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "iamax_: vector empty") \ + return 0; \ + }\ +\ + /* Initialize BLIS. */ \ + bli_init_auto(); \ +\ + /* Convert/typecast negative values of n to zero. */ \ + bli_convert_blas_dim1( *n, n0 ); \ +\ + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ \ + bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \ +\ + /* Call BLIS interface. */ \ + PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \ + ( \ + n0, \ + x0, incx0, \ + &bli_index, \ + NULL, \ + NULL \ + ); \ +\ + /* Convert zero-based BLIS (C) index to one-based BLAS (Fortran) + index. Also, if the BLAS integer size differs from the BLIS + integer size, that typecast occurs here. */ \ + f77_index = bli_index + 1; \ +\ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ + return f77_index; \ +} + +#ifdef BLIS_ENABLE_BLAS + +f77_int isamax_ + ( + const f77_int* n, + const float* x, const f77_int* incx + ) +{ + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx); + + dim_t n0; + float* x0; + inc_t incx0; + gint_t bli_index; + f77_int f77_index; + + /* If the vector is empty, return an index of zero. This early check + is needed to emulate netlib BLAS. Without it, bli_?amaxv() will + return 0, which ends up getting incremented to 1 (below) before + being returned, which is not what we want. */ + if ( *n < 1 || *incx <= 0 ) { + AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "isamax_: vector empty"); + return 0; + } + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. 
*/ + + x0 = ((float*)x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + + } + else + { + x0 = ((float*)x); + incx0 = ( inc_t )(*incx); + } + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) + { + /* Call BLIS kernel */ + bli_samaxv_zen_int + ( + n0, + x0, incx0, + &bli_index, + NULL + ); + } + else + { + PASTEMAC2(s,amaxv,BLIS_TAPI_EX_SUF) + ( + n0, + x0, incx0, + &bli_index, + NULL, + NULL + ); + } + + /* Convert zero-based BLIS (C) index to one-based BLAS (Fortran) + index. Also, if the BLAS integer size differs from the BLIS + integer size, that typecast occurs here. */ + f77_index = bli_index + 1; + + /* Finalize BLIS. */ +// bli_finalize_auto(); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + + return f77_index; +} + +f77_int idamax_ + ( + const f77_int* n, + const double* x, const f77_int* incx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx); + + dim_t n0; + double* x0; + inc_t incx0; + gint_t bli_index; + f77_int f77_index; + + /* If the vector is empty, return an index of zero. This early check + is needed to emulate netlib BLAS. Without it, bli_?amaxv() will + return 0, which ends up getting incremented to 1 (below) before + being returned, which is not what we want. */ + if ( *n < 1 || *incx <= 0 ) { + AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "idamax_: vector empty"); + return 0; + } + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = ((double*)x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + + } + else + { + x0 = ((double*)x); + incx0 = ( inc_t )(*incx); + } + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) + { + /* Call BLIS kernel */ + bli_damaxv_zen_int + ( + n0, + x0, incx0, + &bli_index, + NULL + ); + } + else + { + PASTEMAC2(d,amaxv,BLIS_TAPI_EX_SUF) + ( + n0, + x0, incx0, + &bli_index, + NULL, + NULL + ); + } + + /* Convert zero-based BLIS (C) index to one-based BLAS (Fortran) + index. Also, if the BLAS integer size differs from the BLIS + integer size, that typecast occurs here. */ + f77_index = bli_index + 1; + + /* Finalize BLIS. 
*/ +// bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return f77_index; +} + +INSERT_GENTFUNC_BLAS_CZ( amax, amaxv ) + +#endif diff --git a/frame/compat/bla_axpy.c b/frame/compat/bla_axpy.c index 93f30e1e5..1a30f417b 100644 --- a/frame/compat/bla_axpy.c +++ b/frame/compat/bla_axpy.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020 - 21, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -87,411 +87,6 @@ void PASTEF77(ch,blasname) \ #ifdef BLIS_ENABLE_BLAS -#ifdef BLIS_CONFIG_EPYC -void saxpy_ -( - const f77_int* n, - const float* alpha, - const float* x, const f77_int* incx, - float* y, const f77_int* incy - ) -{ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) - AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, (float*)alpha, *incx, *incy) - dim_t n0; - float* x0; - float* y0; - inc_t incx0; - inc_t incy0; - - /* Initialize BLIS. */ - // bli_init_auto(); - - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - x0 = ((float*)x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - } - else - { - x0 = ((float*)x); - incx0 = ( inc_t )(*incx); - } - if ( *incy < 0 ) - { - y0 = ((float*)y) + (n0-1)*(-*incy); - incy0 = ( inc_t )(*incy); - } - else - { - y0 = ((float*)y); - incy0 = ( inc_t )(*incy); - } - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) - { - bli_saxpyv_zen_int10 - ( - BLIS_NO_CONJUGATE, - n0, - (float*)alpha, - x0, incx0, - y0, incy0, - NULL - ); - - } - else - { - PASTEMAC2(s,axpyv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n0, - (float*)alpha, - x0, incx0, - y0, incy0, - NULL, - NULL - ); - - } - /* Finalize BLIS. 
*/ - // bli_finalize_auto(); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); -} - -void daxpy_ -( - const f77_int* n, - const double* alpha, - const double* x, const f77_int* incx, - double* y, const f77_int* incy - ) -{ - dim_t n0; - double* x0; - double* y0; - inc_t incx0; - inc_t incy0; - - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) - AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, (double*)alpha, *incx, *incy) - /* Initialize BLIS. */ - // bli_init_auto(); - - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - x0 = ((double*)x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - } - else - { - x0 = ((double*)x); - incx0 = ( inc_t )(*incx); - } - if ( *incy < 0 ) - { - y0 = ((double*)y) + (n0-1)*(-*incy); - incy0 = ( inc_t )(*incy); - } - else - { - y0 = ((double*)y); - incy0 = ( inc_t )(*incy); - } - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) - { - bli_daxpyv_zen_int10 - ( - BLIS_NO_CONJUGATE, - n0, - (double*)alpha, - x0, incx0, - y0, incy0, - NULL - ); - - } - else - { - PASTEMAC2(d,axpyv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n0, - (double*)alpha, - x0, incx0, - y0, incy0, - NULL, - NULL - ); - - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS. */ - // bli_finalize_auto(); -} - -void caxpy_ -( - const f77_int* n, - const scomplex* alpha, - const scomplex* x, const f77_int* incx, - scomplex* y, const f77_int* incy - ) -{ - dim_t n0; - scomplex* x0; - scomplex* y0; - inc_t incx0; - inc_t incy0; - - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) - AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, (scomplex*)alpha, *incx, *incy) - - /* Initialize BLIS. */ - // bli_init_auto(); - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. 
(Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - x0 = ((scomplex*)x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - } - else - { - x0 = ((scomplex*)x); - incx0 = ( inc_t )(*incx); - } - if ( *incy < 0 ) - { - y0 = ((scomplex*)y) + (n0-1)*(-*incy); - incy0 = ( inc_t )(*incy); - } - else - { - y0 = ((scomplex*)y); - incy0 = ( inc_t )(*incy); - } - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) - { - bli_caxpyv_zen_int5 - ( - BLIS_NO_CONJUGATE, - n0, - (scomplex*)alpha, - x0, incx0, - y0, incy0, - NULL - ); - - } - else - { - PASTEMAC2(c,axpyv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n0, - (scomplex*)alpha, - x0, incx0, - y0, incy0, - NULL, - NULL - ); - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS. */ - // bli_finalize_auto(); -} - -void zaxpy_ -( - const f77_int* n, - const dcomplex* alpha, - const dcomplex* x, const f77_int* incx, - dcomplex* y, const f77_int* incy - ) -{ - dim_t n0; - dcomplex* x0; - dcomplex* y0; - inc_t incx0; - inc_t incy0; - - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) - AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, (dcomplex*)alpha, *incx, *incy) - - /* Initialize BLIS. */ - // bli_init_auto(); - - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. 
*/ - x0 = ((dcomplex*)x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - } - else - { - x0 = ((dcomplex*)x); - incx0 = ( inc_t )(*incx); - } - if ( *incy < 0 ) - { - y0 = ((dcomplex*)y) + (n0-1)*(-*incy); - incy0 = ( inc_t )(*incy); - } - else - { - y0 = ((dcomplex*)y); - incy0 = ( inc_t )(*incy); - } - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) - { - bli_zaxpyv_zen_int5 - ( - BLIS_NO_CONJUGATE, - n0, - (dcomplex*)alpha, - x0, incx0, - y0, incy0, - NULL - ); - - } - else - { - PASTEMAC2(z,axpyv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n0, - (dcomplex*)alpha, - x0, incx0, - y0, incy0, - NULL, - NULL - ); - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS. */ - // bli_finalize_auto(); -} - -#else INSERT_GENTFUNC_BLAS( axpy, axpyv ) -#endif #endif diff --git a/frame/compat/bla_axpy_amd.c b/frame/compat/bla_axpy_amd.c new file mode 100644 index 000000000..8a9f0280c --- /dev/null +++ b/frame/compat/bla_axpy_amd.c @@ -0,0 +1,462 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +// +// Define BLAS-to-BLIS interfaces. 
+// +#undef GENTFUNC +#define GENTFUNC( ftype, ch, blasname, blisname ) \ +\ +void PASTEF77(ch,blasname) \ + ( \ + const f77_int* n, \ + const ftype* alpha, \ + const ftype* x, const f77_int* incx, \ + ftype* y, const f77_int* incy \ + ) \ +{ \ + dim_t n0; \ + ftype* x0; \ + ftype* y0; \ + inc_t incx0; \ + inc_t incy0; \ +\ + /* Initialize BLIS. */ \ + bli_init_auto(); \ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \ + AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, (void*)alpha, *incx, *incy) \ + /* Convert/typecast negative values of n to zero. */ \ + bli_convert_blas_dim1( *n, n0 ); \ +\ + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ \ + bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \ + bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ +\ + /* Call BLIS interface. */ \ + PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + n0, \ + (ftype*)alpha, \ + x0, incx0, \ + y0, incy0, \ + NULL, \ + NULL \ + ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ +} + +#ifdef BLIS_ENABLE_BLAS + +void saxpy_ +( + const f77_int* n, + const float* alpha, + const float* x, const f77_int* incx, + float* y, const f77_int* incy + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, (float*)alpha, *incx, *incy) + dim_t n0; + float* x0; + float* y0; + inc_t incx0; + inc_t incy0; + + /* Initialize BLIS. */ + // bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + x0 = ((float*)x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + } + else + { + x0 = ((float*)x); + incx0 = ( inc_t )(*incx); + } + if ( *incy < 0 ) + { + y0 = ((float*)y) + (n0-1)*(-*incy); + incy0 = ( inc_t )(*incy); + } + else + { + y0 = ((float*)y); + incy0 = ( inc_t )(*incy); + } + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) + { + bli_saxpyv_zen_int10 + ( + BLIS_NO_CONJUGATE, + n0, + (float*)alpha, + x0, incx0, + y0, incy0, + NULL + ); + + } + else + { + PASTEMAC2(s,axpyv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + n0, + (float*)alpha, + x0, incx0, + y0, incy0, + NULL, + NULL + ); + + } + /* Finalize BLIS. 
*/ + // bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); +} + +void daxpy_ +( + const f77_int* n, + const double* alpha, + const double* x, const f77_int* incx, + double* y, const f77_int* incy + ) +{ + dim_t n0; + double* x0; + double* y0; + inc_t incx0; + inc_t incy0; + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, (double*)alpha, *incx, *incy) + /* Initialize BLIS. */ + // bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + x0 = ((double*)x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + } + else + { + x0 = ((double*)x); + incx0 = ( inc_t )(*incx); + } + if ( *incy < 0 ) + { + y0 = ((double*)y) + (n0-1)*(-*incy); + incy0 = ( inc_t )(*incy); + } + else + { + y0 = ((double*)y); + incy0 = ( inc_t )(*incy); + } + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) + { + bli_daxpyv_zen_int10 + ( + BLIS_NO_CONJUGATE, + n0, + (double*)alpha, + x0, incx0, + y0, incy0, + NULL + ); + + } + else + { + PASTEMAC2(d,axpyv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + n0, + (double*)alpha, + x0, incx0, + y0, incy0, + NULL, + NULL + ); + + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS. */ + // bli_finalize_auto(); +} + +void caxpy_ +( + const f77_int* n, + const scomplex* alpha, + const scomplex* x, const f77_int* incx, + scomplex* y, const f77_int* incy + ) +{ + dim_t n0; + scomplex* x0; + scomplex* y0; + inc_t incx0; + inc_t incy0; + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, (scomplex*)alpha, *incx, *incy) + + /* Initialize BLIS. */ + // bli_init_auto(); + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. 
By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + x0 = ((scomplex*)x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + } + else + { + x0 = ((scomplex*)x); + incx0 = ( inc_t )(*incx); + } + if ( *incy < 0 ) + { + y0 = ((scomplex*)y) + (n0-1)*(-*incy); + incy0 = ( inc_t )(*incy); + } + else + { + y0 = ((scomplex*)y); + incy0 = ( inc_t )(*incy); + } + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) + { + bli_caxpyv_zen_int5 + ( + BLIS_NO_CONJUGATE, + n0, + (scomplex*)alpha, + x0, incx0, + y0, incy0, + NULL + ); + + } + else + { + PASTEMAC2(c,axpyv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + n0, + (scomplex*)alpha, + x0, incx0, + y0, incy0, + NULL, + NULL + ); + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS. */ + // bli_finalize_auto(); +} + +void zaxpy_ +( + const f77_int* n, + const dcomplex* alpha, + const dcomplex* x, const f77_int* incx, + dcomplex* y, const f77_int* incy + ) +{ + dim_t n0; + dcomplex* x0; + dcomplex* y0; + inc_t incx0; + inc_t incy0; + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, (dcomplex*)alpha, *incx, *incy) + + /* Initialize BLIS. */ + // bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + x0 = ((dcomplex*)x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + } + else + { + x0 = ((dcomplex*)x); + incx0 = ( inc_t )(*incx); + } + if ( *incy < 0 ) + { + y0 = ((dcomplex*)y) + (n0-1)*(-*incy); + incy0 = ( inc_t )(*incy); + } + else + { + y0 = ((dcomplex*)y); + incy0 = ( inc_t )(*incy); + } + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) + { + bli_zaxpyv_zen_int5 + ( + BLIS_NO_CONJUGATE, + n0, + (dcomplex*)alpha, + x0, incx0, + y0, incy0, + NULL + ); + + } + else + { + PASTEMAC2(z,axpyv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + n0, + (dcomplex*)alpha, + x0, incx0, + y0, incy0, + NULL, + NULL + ); + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS. 
*/ + // bli_finalize_auto(); +} + + + +#endif diff --git a/frame/compat/bla_copy.c b/frame/compat/bla_copy.c index f4aa3ee83..74baba689 100644 --- a/frame/compat/bla_copy.c +++ b/frame/compat/bla_copy.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -88,217 +88,5 @@ void PASTEF77(ch,blasname) \ } #ifdef BLIS_ENABLE_BLAS -#ifdef BLIS_CONFIG_EPYC - -void scopy_ -( - const f77_int* n, - const float* x, const f77_int* incx, - float* y, const f77_int* incy -) -{ - dim_t n0; - float* x0; - float* y0; - inc_t incx0; - inc_t incy0; - - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) - AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy) - /* Initialize BLIS. */ -// bli_init_auto(); - - /* Convert/typecast negative values of n to zero. */ - if (*n < 0) - n0 = (dim_t)0; - else - n0 = (dim_t)(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if (*incx < 0) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - - x0 = (float*)((x)+(n0 - 1)*(-*incx)); - incx0 = (inc_t)(*incx); - - } - else - { - x0 = (float*)(x); - incx0 = (inc_t)(*incx); - } - - if (*incy < 0) - { - y0 = (y)+(n0 - 1)*(-*incy); - incy0 = (inc_t)(*incy); - - } - else - { - y0 = (y); - incy0 = (inc_t)(*incy); - } - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) - { - /* Call BLIS kernel */ - bli_scopyv_zen_int - ( - BLIS_NO_CONJUGATE, - n0, - x0, incx0, - y0, incy0, - NULL - ); - } - else - { - PASTEMAC2(s, copyv, BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n0, - x0, incx0, - y0, incy0, - NULL, - NULL - ); - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) - /* Finalize BLIS. */ -// bli_finalize_auto(); -} - -void dcopy_ -( - const f77_int* n, - const double* x, const f77_int* incx, - double* y, const f77_int* incy -) -{ - dim_t n0; - double* x0; - double* y0; - inc_t incx0; - inc_t incy0; - - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); - AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy) - /* Initialize BLIS. 
*/ -// bli_init_auto(); - - /* Convert/typecast negative values of n to zero. */ - if (*n < 0) - n0 = (dim_t)0; - else - n0 = (dim_t)(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if (*incx < 0) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - - x0 = (double*)((x)+(n0 - 1)*(-*incx)); - incx0 = (inc_t)(*incx); - - } - else - { - x0 = (double*)(x); - incx0 = (inc_t)(*incx); - } - - if (*incy < 0) - { - y0 = (y)+(n0 - 1)*(-*incy); - incy0 = (inc_t)(*incy); - - } - else - { - y0 = (y); - incy0 = (inc_t)(*incy); - } - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) - { - /* Call BLIS kernel */ - bli_dcopyv_zen_int - ( - BLIS_NO_CONJUGATE, - n0, - x0, incx0, - y0, incy0, - NULL - ); - } - else - { - PASTEMAC2(d, copyv, BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n0, - x0, incx0, - y0, incy0, - NULL, - NULL - ); - } - - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) - /* Finalize BLIS. */ -// bli_finalize_auto(); -} - -INSERT_GENTFUNC_BLAS_CZ(copy, copyv) -#else INSERT_GENTFUNC_BLAS(copy, copyv) #endif -#endif diff --git a/frame/compat/bla_copy_amd.c b/frame/compat/bla_copy_amd.c new file mode 100644 index 000000000..8dc4d5287 --- /dev/null +++ b/frame/compat/bla_copy_amd.c @@ -0,0 +1,285 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +// +// Define BLAS-to-BLIS interfaces. +// +#undef GENTFUNC +#define GENTFUNC( ftype, ch, blasname, blisname ) \ +\ +void PASTEF77(ch,blasname) \ + ( \ + const f77_int* n, \ + const ftype* x, const f77_int* incx, \ + ftype* y, const f77_int* incy \ + ) \ +{ \ + dim_t n0; \ + ftype* x0; \ + ftype* y0; \ + inc_t incx0; \ + inc_t incy0; \ +\ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \ + AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, *incx, *incy) \ +\ + /* Initialize BLIS. */ \ + bli_init_auto(); \ +\ + /* Convert/typecast negative values of n to zero. */ \ + bli_convert_blas_dim1( *n, n0 ); \ +\ + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ \ + bli_convert_blas_incv(n0, (ftype*)x, *incx, x0, incx0); \ + bli_convert_blas_incv(n0, (ftype*)y, *incy, y0, incy0); \ + \ + /* Call BLIS interface. */ \ + PASTEMAC2(ch, blisname, BLIS_TAPI_EX_SUF) \ + (\ + BLIS_NO_CONJUGATE, \ + n0, \ + x0, incx0, \ + y0, incy0, \ + NULL, \ + NULL \ + ); \ + \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ +\ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ +} + +#ifdef BLIS_ENABLE_BLAS + +void scopy_ +( + const f77_int* n, + const float* x, const f77_int* incx, + float* y, const f77_int* incy +) +{ + dim_t n0; + float* x0; + float* y0; + inc_t incx0; + inc_t incy0; + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy) + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if (*n < 0) + n0 = (dim_t)0; + else + n0 = (dim_t)(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if (*incx < 0) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. 
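       As a purely illustrative example (the numbers here are an editorial
       assumption chosen for exposition, not values taken from the sources):
       with n = 4 and incx = -2, BLAS passes the address of x[0], and the
       logical vector x[0], x[2], x[4], x[6] is traversed in reverse order,
       i.e. x[6], x[4], x[2], x[0]; the adjustment below computes
       x0 = x + (4-1)*2 = x + 6 and keeps incx0 = -2, which is exactly the
       starting address and (negative) stride that BLIS expects.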
*/ + + x0 = (float*)((x)+(n0 - 1)*(-*incx)); + incx0 = (inc_t)(*incx); + + } + else + { + x0 = (float*)(x); + incx0 = (inc_t)(*incx); + } + + if (*incy < 0) + { + y0 = (y)+(n0 - 1)*(-*incy); + incy0 = (inc_t)(*incy); + + } + else + { + y0 = (y); + incy0 = (inc_t)(*incy); + } + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) + { + /* Call BLIS kernel */ + bli_scopyv_zen_int + ( + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + NULL + ); + } + else + { + PASTEMAC2(s, copyv, BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + NULL, + NULL + ); + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) + /* Finalize BLIS. */ +// bli_finalize_auto(); +} + +void dcopy_ +( + const f77_int* n, + const double* x, const f77_int* incx, + double* y, const f77_int* incy +) +{ + dim_t n0; + double* x0; + double* y0; + inc_t incx0; + inc_t incy0; + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy) + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if (*n < 0) + n0 = (dim_t)0; + else + n0 = (dim_t)(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if (*incx < 0) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = (double*)((x)+(n0 - 1)*(-*incx)); + incx0 = (inc_t)(*incx); + + } + else + { + x0 = (double*)(x); + incx0 = (inc_t)(*incx); + } + + if (*incy < 0) + { + y0 = (y)+(n0 - 1)*(-*incy); + incy0 = (inc_t)(*incy); + + } + else + { + y0 = (y); + incy0 = (inc_t)(*incy); + } + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) + { + /* Call BLIS kernel */ + bli_dcopyv_zen_int + ( + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + NULL + ); + } + else + { + PASTEMAC2(d, copyv, BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + NULL, + NULL + ); + } + + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) + /* Finalize BLIS. */ +// bli_finalize_auto(); +} + +INSERT_GENTFUNC_BLAS_CZ(copy, copyv) + +#endif diff --git a/frame/compat/bla_dot.c b/frame/compat/bla_dot.c index 419f8c7dc..3c4d8c538 100644 --- a/frame/compat/bla_dot.c +++ b/frame/compat/bla_dot.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018-2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -90,681 +90,11 @@ ftype PASTEF772(ch,blasname,chc) \ } #ifdef BLIS_ENABLE_BLAS -#ifdef BLIS_CONFIG_EPYC -float sdot_ - ( - const f77_int* n, - const float* x, const f77_int* incx, - const float* y, const f77_int* incy - ) -{ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); - AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy); - dim_t n0; - float* x0; - float* y0; - inc_t incx0; - inc_t incy0; - float rho; - - /* Initialize BLIS. */ -// bli_init_auto(); - - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - - x0 = ((float*)x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - - } - else - { - x0 = ((float*)x); - incx0 = ( inc_t )(*incx); - } - - if ( *incy < 0 ) - { - y0 = ((float*)y) + (n0-1)*(-*incy); - incy0 = ( inc_t )(*incy); - - } - else - { - y0 = ((float*)y); - incy0 = ( inc_t )(*incy); - } - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) - { - /* Call BLIS kernel. */ - bli_sdotv_zen_int10 - ( - BLIS_NO_CONJUGATE, - BLIS_NO_CONJUGATE, - n0, - x0, incx0, - y0, incy0, - &rho, - NULL - ); - } - else - { - /* Call BLIS interface. */ - PASTEMAC2(s,dotv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - BLIS_NO_CONJUGATE, - n0, - x0, incx0, - y0, incy0, - &rho, - NULL, - NULL - ); - } - - /* Finalize BLIS. */ -// bli_finalize_auto(); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return rho; -} - -double ddot_ - ( - const f77_int* n, - const double* x, const f77_int* incx, - const double* y, const f77_int* incy - ) -{ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); - AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy); - dim_t n0; - double* x0; - double* y0; - inc_t incx0; - inc_t incy0; - double rho; - - /* Initialize BLIS. */ -// bli_init_auto(); - - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. 
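       A minimal sketch of the equivalent conversion as written in the
       macro-based wrapper of bla_dot_amd.c (helper names as used there,
       specialized to the single-precision case purely for illustration):

           bli_convert_blas_dim1( *n, n0 );
           bli_convert_blas_incv( n0, (float*)x, *incx, x0, incx0 );
           bli_convert_blas_incv( n0, (float*)y, *incy, y0, incy0 );

       These helpers fold the negative-increment pointer adjustment spelled
       out below into a single call per vector.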
*/ - - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - - x0 = ((double*)x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - - } - else - { - x0 = ((double*)x); - incx0 = ( inc_t )(*incx); - } - - if ( *incy < 0 ) - { - y0 = ((double*)y) + (n0-1)*(-*incy); - incy0 = ( inc_t )(*incy); - - } - else - { - y0 = ((double*)y); - incy0 = ( inc_t )(*incy); - } - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) - { - /* Call BLIS kernel. */ - bli_ddotv_zen_int10 - ( - BLIS_NO_CONJUGATE, - BLIS_NO_CONJUGATE, - n0, - x0, incx0, - y0, incy0, - &rho, - NULL - ); - } - else - { - /* Call BLIS interface. */ - PASTEMAC2(d,dotv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - BLIS_NO_CONJUGATE, - n0, - x0, incx0, - y0, incy0, - &rho, - NULL, - NULL - ); - } - - /* Finalize BLIS. */ -// bli_finalize_auto(); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return rho; -} -#else INSERT_GENTFUNCDOTR_BLAS( dot, dotv ) -#endif #ifdef BLIS_ENABLE_BLAS #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL -#ifdef BLIS_CONFIG_EPYC -scomplex cdotu_ - ( - const f77_int* n, - const scomplex* x, const f77_int* incx, - const scomplex* y, const f77_int* incy - ) -{ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); - AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, *incx, *incy); - dim_t n0; - scomplex* x0; - scomplex* y0; - inc_t incx0; - inc_t incy0; - scomplex rho; - - /* Initialize BLIS. */ -// bli_init_auto(); - - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. 
Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - - x0 = ((scomplex*)x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - - } - else - { - x0 = ((scomplex*)x); - incx0 = ( inc_t )(*incx); - } - - if ( *incy < 0 ) - { - y0 = ((scomplex*)y) + (n0-1)*(-*incy); - incy0 = ( inc_t )(*incy); - - } - else - { - y0 = ((scomplex*)y); - incy0 = ( inc_t )(*incy); - } - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) - { - /* Call BLIS kernel. */ - bli_cdotv_zen_int5 - ( - BLIS_NO_CONJUGATE, - BLIS_NO_CONJUGATE, - n0, - x0, incx0, - y0, incy0, - &rho, - NULL - ); - } - else - { - /* Call BLIS interface. */ - PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - BLIS_NO_CONJUGATE, - n0, - x0, incx0, - y0, incy0, - &rho, - NULL, - NULL - ); - } - - /* Finalize BLIS. */ -// bli_finalize_auto(); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return rho; -} - -dcomplex zdotu_ - ( - const f77_int* n, - const dcomplex* x, const f77_int* incx, - const dcomplex* y, const f77_int* incy - ) -{ - dim_t n0; - dcomplex* x0; - dcomplex* y0; - inc_t incx0; - inc_t incy0; - dcomplex rho; - - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); - AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, *incx, *incy); - - /* Initialize BLIS. */ -// bli_init_auto(); - - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - - x0 = ((dcomplex*)x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - - } - else - { - x0 = ((dcomplex*)x); - incx0 = ( inc_t )(*incx); - } - - if ( *incy < 0 ) - { - y0 = ((dcomplex*)y) + (n0-1)*(-*incy); - incy0 = ( inc_t )(*incy); - - } - else - { - y0 = ((dcomplex*)y); - incy0 = ( inc_t )(*incy); - } - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). 
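    // A minimal sketch of the feature-based dispatch that the *_amd.c
    // wrappers added in this patch (e.g. bla_amax_amd.c, bla_copy_amd.c)
    // use in place of this per-architecture id list, shown here with the
    // kernel and arguments from the surrounding code; whether bla_dot_amd.c
    // uses exactly this form is not shown in this excerpt:
    //
    //     if ( bli_cpuid_is_avx_supported() == TRUE )
    //     {
    //         /* AVX available: call the optimized zen kernel directly. */
    //         bli_zdotv_zen_int5( BLIS_NO_CONJUGATE, BLIS_NO_CONJUGATE,
    //                             n0, x0, incx0, y0, incy0, &rho, NULL );
    //     }
    //     else
    //     {
    //         /* No AVX: fall back to the reference kernel selected via the
    //            framework and context. */
    //         PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
    //         ( BLIS_NO_CONJUGATE, BLIS_NO_CONJUGATE,
    //           n0, x0, incx0, y0, incy0, &rho, NULL, NULL );
    //     }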
- arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) - { - /* Call BLIS kernel. */ - bli_zdotv_zen_int5 - ( - BLIS_NO_CONJUGATE, - BLIS_NO_CONJUGATE, - n0, - x0, incx0, - y0, incy0, - &rho, - NULL - ); - } - else - { - /* Call BLIS interface. */ - PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - BLIS_NO_CONJUGATE, - n0, - x0, incx0, - y0, incy0, - &rho, - NULL, - NULL - ); - } - - /* Finalize BLIS. */ -// bli_finalize_auto(); - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - - return rho; -} - - -scomplex cdotc_ - ( - const f77_int* n, - const scomplex* x, const f77_int* incx, - const scomplex* y, const f77_int* incy - ) -{ - dim_t n0; - scomplex* x0; - scomplex* y0; - inc_t incx0; - inc_t incy0; - scomplex rho; - - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); - AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, *incx, *incy); - - /* Initialize BLIS. */ -// bli_init_auto(); - - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - - x0 = ((scomplex*)x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - - } - else - { - x0 = ((scomplex*)x); - incx0 = ( inc_t )(*incx); - } - - if ( *incy < 0 ) - { - y0 = ((scomplex*)y) + (n0-1)*(-*incy); - incy0 = ( inc_t )(*incy); - - } - else - { - y0 = ((scomplex*)y); - incy0 = ( inc_t )(*incy); - } - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) - { - /* Call BLIS kernel. */ - bli_cdotv_zen_int5 - ( - BLIS_CONJUGATE, - BLIS_NO_CONJUGATE, - n0, - x0, incx0, - y0, incy0, - &rho, - NULL - ); - } - else - { - /* Call BLIS interface. */ - PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF) - ( - BLIS_CONJUGATE, - BLIS_NO_CONJUGATE, - n0, - x0, incx0, - y0, incy0, - &rho, - NULL, - NULL - ); - } - - /* Finalize BLIS. 
*/ -// bli_finalize_auto(); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - - return rho; -} - -dcomplex zdotc_ - ( - const f77_int* n, - const dcomplex* x, const f77_int* incx, - const dcomplex* y, const f77_int* incy - ) -{ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); - AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, *incx, *incy); - dim_t n0; - dcomplex* x0; - dcomplex* y0; - inc_t incx0; - inc_t incy0; - dcomplex rho; - - /* Initialize BLIS. */ -// bli_init_auto(); - - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - - x0 = ((dcomplex*)x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - - } - else - { - x0 = ((dcomplex*)x); - incx0 = ( inc_t )(*incx); - } - - if ( *incy < 0 ) - { - y0 = ((dcomplex*)y) + (n0-1)*(-*incy); - incy0 = ( inc_t )(*incy); - - } - else - { - y0 = ((dcomplex*)y); - incy0 = ( inc_t )(*incy); - } - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) - { - /* Call BLIS kernel. */ - bli_zdotv_zen_int5 - ( - BLIS_CONJUGATE, - BLIS_NO_CONJUGATE, - n0, - x0, incx0, - y0, incy0, - &rho, - NULL - ); - } - else - { - /* Call BLIS interface. */ - PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF) - ( - BLIS_CONJUGATE, - BLIS_NO_CONJUGATE, - n0, - x0, incx0, - y0, incy0, - &rho, - NULL, - NULL - ); - } - - - - - - /* Finalize BLIS. 
*/ -// bli_finalize_auto(); - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - - return rho; -} -#else INSERT_GENTFUNCDOTC_BLAS( dot, dotv ) -#endif #else // For the "intel" complex return type, use a hidden parameter to return the result #undef GENTFUNCDOT @@ -819,8 +149,8 @@ void PASTEF772(ch,blasname,chc) \ } INSERT_GENTFUNCDOTC_BLAS( dot, dotv ) -#endif -#endif +#endif // BLIS_DISABLE_COMPLEX_RETURN_INTEL +#endif // BLIS_ENABLE_BLAS // -- "Black sheep" dot product function definitions -- @@ -894,4 +224,4 @@ double PASTEF77(d,sdot) return rho; } -#endif +#endif // BLIS_ENABLE_BLAS diff --git a/frame/compat/bla_dot_amd.c b/frame/compat/bla_dot_amd.c new file mode 100644 index 000000000..0cdaa6535 --- /dev/null +++ b/frame/compat/bla_dot_amd.c @@ -0,0 +1,841 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +// +// Define BLAS-to-BLIS interfaces. +// +#undef GENTFUNCDOT +#define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \ +\ +ftype PASTEF772(ch,blasname,chc) \ + ( \ + const f77_int* n, \ + const ftype* x, const f77_int* incx, \ + const ftype* y, const f77_int* incy \ + ) \ +{ \ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \ + AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, *incx, *incy); \ + dim_t n0; \ + ftype* x0; \ + ftype* y0; \ + inc_t incx0; \ + inc_t incy0; \ + ftype rho; \ +\ + /* Initialize BLIS. */ \ + bli_init_auto(); \ +\ + /* Convert/typecast negative values of n to zero. */ \ + bli_convert_blas_dim1( *n, n0 ); \ +\ + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ \ + bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \ + bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ +\ + /* Call BLIS interface. 
*/ \ + PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ + ( \ + blis_conjx, \ + BLIS_NO_CONJUGATE, \ + n0, \ + x0, incx0, \ + y0, incy0, \ + &rho, \ + NULL, \ + NULL \ + ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ +\ + return rho; \ +} + +#ifdef BLIS_ENABLE_BLAS +float sdot_ + ( + const f77_int* n, + const float* x, const f77_int* incx, + const float* y, const f77_int* incy + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy); + dim_t n0; + float* x0; + float* y0; + inc_t incx0; + inc_t incy0; + float rho; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = ((float*)x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + + } + else + { + x0 = ((float*)x); + incx0 = ( inc_t )(*incx); + } + + if ( *incy < 0 ) + { + y0 = ((float*)y) + (n0-1)*(-*incy); + incy0 = ( inc_t )(*incy); + + } + else + { + y0 = ((float*)y); + incy0 = ( inc_t )(*incy); + } + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) + { + /* Call BLIS kernel. */ + bli_sdotv_zen_int10 + ( + BLIS_NO_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + &rho, + NULL + ); + } + else + { + /* Call BLIS interface. */ + PASTEMAC2(s,dotv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + &rho, + NULL, + NULL + ); + } + + /* Finalize BLIS. */ +// bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return rho; +} + +double ddot_ + ( + const f77_int* n, + const double* x, const f77_int* incx, + const double* y, const f77_int* incy + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy); + dim_t n0; + double* x0; + double* y0; + inc_t incx0; + inc_t incy0; + double rho; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) 
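The negative-increment handling repeated in each of these wrappers boils down to one pointer rebase. A minimal sketch of that adjustment as a hypothetical helper (the real wrappers open-code it, as above, or call bli_convert_blas_incv()):

/* BLAS callers always pass the address of element 0, even for a negative
   stride; BLIS expects the address of element (n-1) in that case. Rebase
   the pointer once, then pass the stride through unchanged. */
static void rebase_vector_for_blis
     (
       dim_t n0, const double* x, f77_int incx,
       double** x0, inc_t* incx0
     )
{
    if ( incx < 0 ) *x0 = ( ( double* )x ) + ( n0 - 1 ) * ( -incx );
    else            *x0 = ( ( double* )x );

    *incx0 = ( inc_t )incx;
}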
This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = ((double*)x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + + } + else + { + x0 = ((double*)x); + incx0 = ( inc_t )(*incx); + } + + if ( *incy < 0 ) + { + y0 = ((double*)y) + (n0-1)*(-*incy); + incy0 = ( inc_t )(*incy); + + } + else + { + y0 = ((double*)y); + incy0 = ( inc_t )(*incy); + } + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) + { + /* Call BLIS kernel. */ + bli_ddotv_zen_int10 + ( + BLIS_NO_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + &rho, + NULL + ); + } + else + { + /* Call BLIS interface. */ + PASTEMAC2(d,dotv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + &rho, + NULL, + NULL + ); + } + + /* Finalize BLIS. */ +// bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return rho; +} + +#ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL +scomplex cdotu_ + ( + const f77_int* n, + const scomplex* x, const f77_int* incx, + const scomplex* y, const f77_int* incy + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, *incx, *incy); + dim_t n0; + scomplex* x0; + scomplex* y0; + inc_t incx0; + inc_t incy0; + scomplex rho; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = ((scomplex*)x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + + } + else + { + x0 = ((scomplex*)x); + incx0 = ( inc_t )(*incx); + } + + if ( *incy < 0 ) + { + y0 = ((scomplex*)y) + (n0-1)*(-*incy); + incy0 = ( inc_t )(*incy); + + } + else + { + y0 = ((scomplex*)y); + incy0 = ( inc_t )(*incy); + } + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) + { + /* Call BLIS kernel. 
*/ + bli_cdotv_zen_int5 + ( + BLIS_NO_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + &rho, + NULL + ); + } + else + { + /* Call BLIS interface. */ + PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + &rho, + NULL, + NULL + ); + } + + /* Finalize BLIS. */ +// bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return rho; +} + +dcomplex zdotu_ + ( + const f77_int* n, + const dcomplex* x, const f77_int* incx, + const dcomplex* y, const f77_int* incy + ) +{ + dim_t n0; + dcomplex* x0; + dcomplex* y0; + inc_t incx0; + inc_t incy0; + dcomplex rho; + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, *incx, *incy); + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = ((dcomplex*)x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + + } + else + { + x0 = ((dcomplex*)x); + incx0 = ( inc_t )(*incx); + } + + if ( *incy < 0 ) + { + y0 = ((dcomplex*)y) + (n0-1)*(-*incy); + incy0 = ( inc_t )(*incy); + + } + else + { + y0 = ((dcomplex*)y); + incy0 = ( inc_t )(*incy); + } + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) + { + /* Call BLIS kernel. */ + bli_zdotv_zen_int5 + ( + BLIS_NO_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + &rho, + NULL + ); + } + else + { + /* Call BLIS interface. */ + PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + &rho, + NULL, + NULL + ); + } + + /* Finalize BLIS. */ +// bli_finalize_auto(); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + + return rho; +} + + +scomplex cdotc_ + ( + const f77_int* n, + const scomplex* x, const f77_int* incx, + const scomplex* y, const f77_int* incy + ) +{ + dim_t n0; + scomplex* x0; + scomplex* y0; + inc_t incx0; + inc_t incy0; + scomplex rho; + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, *incx, *incy); + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. 
*/ + + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = ((scomplex*)x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + + } + else + { + x0 = ((scomplex*)x); + incx0 = ( inc_t )(*incx); + } + + if ( *incy < 0 ) + { + y0 = ((scomplex*)y) + (n0-1)*(-*incy); + incy0 = ( inc_t )(*incy); + + } + else + { + y0 = ((scomplex*)y); + incy0 = ( inc_t )(*incy); + } + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) + { + /* Call BLIS kernel. */ + bli_cdotv_zen_int5 + ( + BLIS_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + &rho, + NULL + ); + } + else + { + /* Call BLIS interface. */ + PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF) + ( + BLIS_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + &rho, + NULL, + NULL + ); + } + + /* Finalize BLIS. */ +// bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + + return rho; +} + +dcomplex zdotc_ + ( + const f77_int* n, + const dcomplex* x, const f77_int* incx, + const dcomplex* y, const f77_int* incy + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, *incx, *incy); + dim_t n0; + dcomplex* x0; + dcomplex* y0; + inc_t incx0; + inc_t incy0; + dcomplex rho; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = ((dcomplex*)x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + + } + else + { + x0 = ((dcomplex*)x); + incx0 = ( inc_t )(*incx); + } + + if ( *incy < 0 ) + { + y0 = ((dcomplex*)y) + (n0-1)*(-*incy); + incy0 = ( inc_t )(*incy); + + } + else + { + y0 = ((dcomplex*)y); + incy0 = ( inc_t )(*incy); + } + + // This function is invoked on all architectures including ‘generic’. 
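By this point both flavours of the complex dot product have appeared; the only difference between the *u and *c wrappers is the conjugation flag forwarded for x. A small illustrative helper (name hypothetical, kernel signature as used above):

static dcomplex zdot_flavour_sketch
     (
       bool  conjugate_x,   /* true for zdotc_, false for zdotu_ */
       dim_t n0,
       dcomplex* x0, inc_t incx0,
       dcomplex* y0, inc_t incy0
     )
{
    dcomplex rho;
    conj_t   conjx = conjugate_x ? BLIS_CONJUGATE : BLIS_NO_CONJUGATE;

    /* Same kernel either way; only conjx differs. */
    bli_zdotv_zen_int5( conjx, BLIS_NO_CONJUGATE, n0,
                        x0, incx0, y0, incy0, &rho, NULL );
    return rho;
}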
+ // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) + { + /* Call BLIS kernel. */ + bli_zdotv_zen_int5 + ( + BLIS_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + &rho, + NULL + ); + } + else + { + /* Call BLIS interface. */ + PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF) + ( + BLIS_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + &rho, + NULL, + NULL + ); + } + + + + + + /* Finalize BLIS. */ +// bli_finalize_auto(); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + + return rho; +} + +#else // BLIS_DISABLE_COMPLEX_RETURN_INTEL +// For the "intel" complex return type, use a hidden parameter to return the result +#undef GENTFUNCDOT +#define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \ +\ +void PASTEF772(ch,blasname,chc) \ + ( \ + ftype* rhop, \ + const f77_int* n, \ + const ftype* x, const f77_int* incx, \ + const ftype* y, const f77_int* incy \ + ) \ +{ \ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \ + AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, *incx, *incy); \ + dim_t n0; \ + ftype* x0; \ + ftype* y0; \ + inc_t incx0; \ + inc_t incy0; \ + ftype rho; \ +\ + /* Initialize BLIS. */ \ + bli_init_auto(); \ +\ + /* Convert/typecast negative values of n to zero. */ \ + bli_convert_blas_dim1( *n, n0 ); \ +\ + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ \ + bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \ + bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ +\ + /* Call BLIS interface. */ \ + PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ + ( \ + blis_conjx, \ + BLIS_NO_CONJUGATE, \ + n0, \ + x0, incx0, \ + y0, incy0, \ + &rho, \ + NULL, \ + NULL \ + ); \ +\ + /* Finalize BLIS. */ \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ + bli_finalize_auto(); \ +\ + *rhop = rho; \ +} + +INSERT_GENTFUNCDOTC_BLAS( dot, dotv ) +#endif // BLIS_DISABLE_COMPLEX_RETURN_INTEL + + + +// -- "Black sheep" dot product function definitions -- + +// Input vectors stored in single precision, computed in double precision, +// with result returned in single precision. +float PASTEF77(sd,sdot) + ( + const f77_int* n, + const float* sb, + const float* x, const f77_int* incx, + const float* y, const f77_int* incy + ) +{ + return ( float ) + ( + ( double )(*sb) + + PASTEF77(d,sdot) + ( + n, + x, incx, + y, incy + ) + ); +} + +// Input vectors stored in single precision, computed in double precision, +// with result returned in double precision. +double PASTEF77(d,sdot) + ( + const f77_int* n, + const float* x, const f77_int* incx, + const float* y, const f77_int* incy + ) +{ + dim_t n0; + float* x0; + float* y0; + inc_t incx0; + inc_t incy0; + double rho; + dim_t i; + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy); + /* Initialization of BLIS is not required. */ + + /* Convert/typecast negative values of n to zero. */ + bli_convert_blas_dim1( *n, n0 ); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. 
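The #else branch above covers the second of the two complex-return conventions this file supports. A hedged caller-side sketch of how the same zdotc_ is invoked under each convention:

static dcomplex call_zdotc_sketch
     (
       f77_int n,
       const dcomplex* x, f77_int incx,
       const dcomplex* y, f77_int incy
     )
{
    dcomplex r;
#ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL
    /* GNU-style convention: the complex result is returned by value. */
    r = zdotc_( &n, x, &incx, y, &incy );
#else
    /* Intel-style convention: the routine returns void and the result
       comes back through a hidden leading argument. */
    zdotc_( &r, &n, x, &incx, y, &incy );
#endif
    return r;
}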
*/ + bli_convert_blas_incv( n0, (float*)x, *incx, x0, incx0 ); + bli_convert_blas_incv( n0, (float*)y, *incy, y0, incy0 ); + + rho = 0.0; + + for ( i = 0; i < n0; i++ ) + { + float* chi1 = x0 + (i )*incx0; + float* psi1 = y0 + (i )*incy0; + + bli_ddots( (( double )(*chi1)), + (( double )(*psi1)), rho ); + } + + /* Finalization of BLIS is not required, because initialization was + not required. */ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + + return rho; +} + +#endif diff --git a/frame/compat/bla_gemm.c b/frame/compat/bla_gemm.c index 80ad197c6..8d08a9e01 100644 --- a/frame/compat/bla_gemm.c +++ b/frame/compat/bla_gemm.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 21, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2019 - 22, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -300,512 +300,7 @@ void PASTEF77(ch,blasname) \ #endif #ifdef BLIS_ENABLE_BLAS -#ifdef BLIS_CONFIG_EPYC -void dgemm_ -( - const f77_char* transa, - const f77_char* transb, - const f77_int* m, - const f77_int* n, - const f77_int* k, - const double* alpha, - const double* a, const f77_int* lda, - const double* b, const f77_int* ldb, - const double* beta, - double* c, const f77_int* ldc -) -{ - - - - trans_t blis_transa; - trans_t blis_transb; - dim_t m0, n0, k0; - - /* Initialize BLIS. */ - bli_init_auto(); - - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) - AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k, \ - (void*)alpha, *lda, *ldb, (void*)beta, *ldc); - - /* Perform BLAS parameter checking. */ - PASTEBLACHK(gemm) - ( - MKSTR(d), - MKSTR(gemm), - transa, - transb, - m, - n, - k, - lda, - ldb, - ldc - ); - - /* Map BLAS chars to their corresponding BLIS enumerated type value. */ - bli_param_map_netlib_to_blis_trans(*transa, &blis_transa); - bli_param_map_netlib_to_blis_trans(*transb, &blis_transb); - - /* Typecast BLAS integers to BLIS integers. */ - bli_convert_blas_dim1(*m, m0); - bli_convert_blas_dim1(*n, n0); - bli_convert_blas_dim1(*k, k0); - - - /* Set the row and column strides of the matrix operands. */ - const inc_t rs_a = 1; - const inc_t cs_a = *lda; - const inc_t rs_b = 1; - const inc_t cs_b = *ldb; - const inc_t rs_c = 1; - const inc_t cs_c = *ldc; - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (!bamdzen) - { - // This code is duplicated below, however we don't want to move it out of - // this IF block as it will affect the performance on Zen architetures - // Also this is temporary fix which will be replaced later. 
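Both the dgemm_ being removed here and the dgemm_ added in bla_gemm_amd.c short-circuit the k == 1, no-transpose case to bli_dgemm_ref_k1_nn (see the branch a little further below). As a sketch of what that case computes, independent of how the kernel is implemented:

/* With k0 == 1 and neither operand transposed, GEMM degenerates to a
   rank-1 update: C(m x n) := beta*C + alpha * a(m x 1) * b(1 x n).
   Column-major reference loops, for illustration only. */
static void dgemm_k1_rank1_sketch
     (
       dim_t m0, dim_t n0,
       double alpha, const double* a,
                     const double* b, dim_t ldb,
       double beta,        double* c, dim_t ldc
     )
{
    for ( dim_t j = 0; j < n0; ++j )
        for ( dim_t i = 0; i < m0; ++i )
            c[ i + j*ldc ] = beta * c[ i + j*ldc ]
                           + alpha * a[ i ] * b[ j*ldb ];
}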
- const num_t dt = BLIS_DOUBLE; - - obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; - obj_t ao = BLIS_OBJECT_INITIALIZER; - obj_t bo = BLIS_OBJECT_INITIALIZER; - obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; - obj_t co = BLIS_OBJECT_INITIALIZER; - - dim_t m0_a, n0_a; - dim_t m0_b, n0_b; - - bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a); - bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b); - - bli_obj_init_finish_1x1(dt, (double *)alpha, &alphao); - bli_obj_init_finish_1x1(dt, (double *)beta, &betao); - - bli_obj_init_finish(dt, m0_a, n0_a, (double *)a, rs_a, cs_a, &ao); - bli_obj_init_finish(dt, m0_b, n0_b, (double *)b, rs_b, cs_b, &bo); - bli_obj_init_finish(dt, m0, n0, (double *)c, rs_c, cs_c, &co); - - bli_obj_set_conjtrans(blis_transa, &ao); - bli_obj_set_conjtrans(blis_transb, &bo); - - // Will call parallelized dgemm code - sup & native - PASTEMAC(gemm, BLIS_OAPI_EX_SUF) - ( - &alphao, - &ao, - &bo, - &betao, - &co, - NULL, - NULL - ); - - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS. */ - bli_finalize_auto(); - return; - } - - if((k0 == 1) && bli_is_notrans(blis_transa) && bli_is_notrans(blis_transb)) - { - bli_dgemm_ref_k1_nn( m0, n0, k0, - (double*)alpha, - (double*)a, *lda, - (double*)b, *ldb, - (double*)beta, - c, *ldc - ); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS */ - bli_finalize_auto(); - - return; - } - - if (n0 == 1) - { - if (bli_is_notrans(blis_transa)) - { - bli_dgemv_unf_var2( - BLIS_NO_TRANSPOSE, - bli_extract_conj(blis_transb), - m0, k0, - (double*)alpha, - (double*)a, rs_a, cs_a, - (double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b, - (double*)beta, - c, rs_c, - ((void*)0) - ); - } - else - { - bli_dgemv_unf_var1( - blis_transa, - bli_extract_conj(blis_transb), - k0, m0, - (double*)alpha, - (double*)a, rs_a, cs_a, - (double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b, - (double*)beta, - c, rs_c, - ((void*)0) - ); - } - - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - - return; - } - else if (m0 == 1) - { - if (bli_is_notrans(blis_transb)) - { - bli_dgemv_unf_var1( - blis_transb, - bli_extract_conj(blis_transa), - n0, k0, - (double*)alpha, - (double*)b, cs_b, rs_b, - (double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a, - (double*)beta, - c, cs_c, - ((void*)0) - ); - } - else - { - bli_dgemv_unf_var2( - blis_transb, - bli_extract_conj(blis_transa), - k0, n0, - (double*)alpha, - (double*)b, cs_b, rs_b, - (double*)a, bli_is_notrans(blis_transa) ? 
cs_a : rs_a, - (double*)beta, - c, cs_c, - ((void*)0) - ); - } - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - return; - } - - const num_t dt = BLIS_DOUBLE; - - obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; - obj_t ao = BLIS_OBJECT_INITIALIZER; - obj_t bo = BLIS_OBJECT_INITIALIZER; - obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; - obj_t co = BLIS_OBJECT_INITIALIZER; - - dim_t m0_a, n0_a; - dim_t m0_b, n0_b; - - bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a); - bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b); - - bli_obj_init_finish_1x1(dt, (double*)alpha, &alphao); - bli_obj_init_finish_1x1(dt, (double*)beta, &betao); - - bli_obj_init_finish(dt, m0_a, n0_a, (double*)a, rs_a, cs_a, &ao); - bli_obj_init_finish(dt, m0_b, n0_b, (double*)b, rs_b, cs_b, &bo); - bli_obj_init_finish(dt, m0, n0, (double*)c, rs_c, cs_c, &co); - - bli_obj_set_conjtrans(blis_transa, &ao); - bli_obj_set_conjtrans(blis_transb, &bo); - - //cntx_t* cntx = bli_gks_query_cntx(); - //dim_t nt = bli_thread_get_num_threads(); // get number of threads - bool nt = bli_thread_get_is_parallel(); // Check if parallel dgemm is invoked. - - // if m0 is large and (n0 & k0) < 10 - SMALL GEMM - ST is better - // - -#ifdef AOCL_DYNAMIC - if (nt && ((n0 > 10 ) || (k0 > 10)) ) -#else - if (nt) -#endif - { - // Will call parallelized dgemm code - sup & native - PASTEMAC(gemm, BLIS_OAPI_EX_SUF) - ( - &alphao, - &ao, - &bo, - &betao, - &co, - NULL, - NULL - ); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS. */ - bli_finalize_auto(); - return; - } - - // The code below will be called when number of threads = 1. - -#ifdef BLIS_ENABLE_SMALL_MATRIX - - //if( ((m0 + n0 -k0) < 2000) && ((m0 + k0-n0) < 2000) && ((n0 + k0-m0) < 2000) && (n0 > 2)) - if( ( ( (m0 + n0 -k0) < 2000) && ((m0 + k0-n0) < 2000) && ((n0 + k0-m0) < 2000) ) || - ((n0 <= 10) && (k0 <=10)) ) - { - err_t status; - if (bli_is_notrans(blis_transa)) - { - status = bli_dgemm_small( &alphao, - &ao, - &bo, - &betao, - &co, - NULL, //cntx, - NULL - ); - } - else - { - status = bli_dgemm_small_At ( &alphao, - &ao, - &bo, - &betao, - &co, - NULL, //cntx, - NULL - ); - } - - if (status == BLIS_SUCCESS) - { - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS. */ - bli_finalize_auto(); - - return; - } - } - -#endif //#ifdef BLIS_ENABLE_SMALL_MATRIX - - err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL); - if (status == BLIS_SUCCESS) - { - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - return; - } - - // fall back on native path when dgemm is not handled in sup path. - bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL); - - - /* PASTEMAC(gemm, BLIS_OAPI_EX_SUF) */ - /* ( */ - /* &alphao, */ - /* &ao, */ - /* &bo, */ - /* &betao, */ - /* &co, */ - /* NULL, */ - /* NULL */ - /* ); */ - - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS. */ - bli_finalize_auto(); -} // end of dgemm_ - -void zgemm_ - ( - const f77_char* transa, - const f77_char* transb, - const f77_int* m, - const f77_int* n, - const f77_int* k, - const dcomplex* alpha, - const dcomplex* a, const f77_int* lda, - const dcomplex* b, const f77_int* ldb, - const dcomplex* beta, - dcomplex* c, const f77_int* ldc - ) -{ - trans_t blis_transa; - trans_t blis_transb; - dim_t m0, n0, k0; - - /* Initialize BLIS. 
*/ - bli_init_auto(); - - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) - AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k, - (void*)alpha, *lda, *ldb, (void*)beta, *ldc); - - /* Perform BLAS parameter checking. */ - PASTEBLACHK(gemm) - ( - MKSTR(z), - MKSTR(gemm), - transa, - transb, - m, - n, - k, - lda, - ldb, - ldc - ); - - /* Map BLAS chars to their corresponding BLIS enumerated type value. */ - bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); - bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); - - /* Typecast BLAS integers to BLIS integers. */ - bli_convert_blas_dim1( *m, m0 ); - bli_convert_blas_dim1( *n, n0 ); - bli_convert_blas_dim1( *k, k0 ); - - /* Set the row and column strides of the matrix operands. */ - const inc_t rs_a = 1; - const inc_t cs_a = *lda; - const inc_t rs_b = 1; - const inc_t cs_b = *ldb; - const inc_t rs_c = 1; - const inc_t cs_c = *ldc; - - const num_t dt = BLIS_DCOMPLEX; - - obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; - obj_t ao = BLIS_OBJECT_INITIALIZER; - obj_t bo = BLIS_OBJECT_INITIALIZER; - obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; - obj_t co = BLIS_OBJECT_INITIALIZER; - - dim_t m0_a, n0_a; - dim_t m0_b, n0_b; - - bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); - bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); - - bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao ); - bli_obj_init_finish_1x1( dt, (dcomplex*)beta, &betao ); - - bli_obj_init_finish( dt, m0_a, n0_a, (dcomplex*)a, rs_a, cs_a, &ao ); - bli_obj_init_finish( dt, m0_b, n0_b, (dcomplex*)b, rs_b, cs_b, &bo ); - bli_obj_init_finish( dt, m0, n0, (dcomplex*)c, rs_c, cs_c, &co ); - - bli_obj_set_conjtrans( blis_transa, &ao ); - bli_obj_set_conjtrans( blis_transb, &bo ); - - // default instance peformance tuning is done in zgemm. - // Single instance tuning is done based on env set. - dim_t single_instance = bli_env_get_var( "BLIS_SINGLE_INSTANCE", -1 ); - - //dim_t nt = bli_thread_get_num_threads(); // get number of threads - bool nt = bli_thread_get_is_parallel(); // Check if parallel zgemm is invoked. - if ( nt ) - { - // Will call parallelized zgemm code - sup & native - PASTEMAC(gemm, BLIS_OAPI_EX_SUF) - ( - &alphao, - &ao, - &bo, - &betao, - &co, - NULL, - NULL - ); - - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - /* Finalize BLIS. */ - bli_finalize_auto(); - return; - } - - // The code below will be called when number of threads = 1. -#if ENABLE_INDUCED_METHOD - /* 3m_sqp is optimal for certain matrix shapes. - Initial study that it works well for square sizes and sizes closer to square shape. - - * Usage of 3m_sqp is restricted to sizes, where it is found efficient compared to native, sup and other induced method. - * Further investigation is necessary to make the usage choices more generic. */ - bool sqp_on = false; - if( (m0 == n0 ) && ( n0 == k0 ) && ( m0 == 128 ) ) - { - sqp_on = true; - } - - // current range of sizes used for 3m_sqp to be expaned after evaluation. - if( ( m0 >= 4200) && ( m0 <= 4600 ) && ( ( n0 >= 326 ) || (n0 <= 1600 ) ) - && ( k0 == 1120 ) ) //to be tuned further. 
- { - sqp_on = true; - } - - if( ( blis_transb == BLIS_NO_TRANSPOSE) && ( sqp_on == true ) ) - { - //sqp algo is found better for n > 40 - if(bli_gemm_sqp(&alphao, &ao, &bo, &betao, &co, NULL, NULL)==BLIS_SUCCESS) - { - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) - return; - } - } -#endif//ENABLE_INDUCED_METHOD - -// native tuning resulted in better numbers compared to sup in constrained multi-instance -// sup has been enabled for single instance cases. - if(single_instance==1) - { - err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL); - if(status==BLIS_SUCCESS) - { - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) - return; - } - - } - // fall back on native path when zgemm is not handled in sup path. - bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL); - AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) - return; - - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) - /* Finalize BLIS. */ - bli_finalize_auto(); -}// end of zgemm_ - - -INSERT_GENTFUNC_BLAS_SC( gemm, gemm ) -#else INSERT_GENTFUNC_BLAS( gemm,gemm ) -#endif // Observed a regression in dgemm with this function addition. // Disabling temporarily. diff --git a/frame/compat/bla_gemm_amd.c b/frame/compat/bla_gemm_amd.c new file mode 100644 index 000000000..7ef58bfb3 --- /dev/null +++ b/frame/compat/bla_gemm_amd.c @@ -0,0 +1,894 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019 - 22, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define BLAS-to-BLIS interfaces. 
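The hand-written dgemm_ defined further down in this new file tries several paths in a fixed order. A compact sketch of that ordering as a hypothetical classifier (the real code inlines these tests; bli_cpuid_is_avx_supported() and the AOCL_DYNAMIC size cutoff are taken from the function itself):

typedef enum
{
    DGEMM_GENERIC_OBJ_API,       /* no AVX at runtime: reference path    */
    DGEMM_K1_RANK1,              /* bli_dgemm_ref_k1_nn                  */
    DGEMM_AS_GEMV,               /* n == 1 or m == 1: gemv variants      */
    DGEMM_PARALLEL_OBJ_API,      /* multithreaded sup/native             */
    DGEMM_ST_SMALL_SUP_NATIVE    /* single thread: small, sup, native    */
} dgemm_path_t;

static dgemm_path_t dgemm_classify_sketch
     (
       dim_t m0, dim_t n0, dim_t k0,
       bool a_notrans, bool b_notrans, bool is_parallel
     )
{
    if ( bli_cpuid_is_avx_supported() == FALSE ) return DGEMM_GENERIC_OBJ_API;
    if ( k0 == 1 && a_notrans && b_notrans )     return DGEMM_K1_RANK1;
    if ( n0 == 1 || m0 == 1 )                    return DGEMM_AS_GEMV;
#ifdef AOCL_DYNAMIC
    if ( is_parallel && ( n0 > 10 || k0 > 10 ) ) return DGEMM_PARALLEL_OBJ_API;
#else
    if ( is_parallel )                           return DGEMM_PARALLEL_OBJ_API;
#endif
    return DGEMM_ST_SMALL_SUP_NATIVE;
}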
+// +#define ENABLE_INDUCED_METHOD 0 +#ifdef BLIS_BLAS3_CALLS_TAPI + +#undef GENTFUNC +#define GENTFUNC( ftype, ch, blasname, blisname ) \ +\ +void PASTEF77(ch,blasname) \ + ( \ + const f77_char* transa, \ + const f77_char* transb, \ + const f77_int* m, \ + const f77_int* n, \ + const f77_int* k, \ + const ftype* alpha, \ + const ftype* a, const f77_int* lda, \ + const ftype* b, const f77_int* ldb, \ + const ftype* beta, \ + ftype* c, const f77_int* ldc \ + ) \ +{ \ + trans_t blis_transa; \ + trans_t blis_transb; \ + dim_t m0, n0, k0; \ + inc_t rs_a, cs_a; \ + inc_t rs_b, cs_b; \ + inc_t rs_c, cs_c; \ +\ + /* Initialize BLIS. */ \ + bli_init_auto(); \ +\ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \ + AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *transb, *m, *n, *k, \ + (void*)alpha, *lda, *ldb, (void*)beta, *ldc); \ +\ + /* Perform BLAS parameter checking. */ \ + PASTEBLACHK(blasname) \ + ( \ + MKSTR(ch), \ + MKSTR(blasname), \ + transa, \ + transb, \ + m, \ + n, \ + k, \ + lda, \ + ldb, \ + ldc \ + ); \ +\ + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ + bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \ +\ + /* Typecast BLAS integers to BLIS integers. */ \ + bli_convert_blas_dim1( *m, m0 ); \ + bli_convert_blas_dim1( *n, n0 ); \ + bli_convert_blas_dim1( *k, k0 ); \ +\ + /* Set the row and column strides of the matrix operands. */ \ + rs_a = 1; \ + cs_a = *lda; \ + rs_b = 1; \ + cs_b = *ldb; \ + rs_c = 1; \ + cs_c = *ldc; \ +\ + /* Call BLIS interface. */ \ + PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ + ( \ + blis_transa, \ + blis_transb, \ + m0, \ + n0, \ + k0, \ + (ftype*)alpha, \ + (ftype*)a, rs_a, cs_a, \ + (ftype*)b, rs_b, cs_b, \ + (ftype*)beta, \ + (ftype*)c, rs_c, cs_c, \ + NULL, \ + NULL \ + ); \ +\ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ +} + +#else + +#undef GENTFUNC +#define GENTFUNC( ftype, ch, blasname, blisname ) \ +\ +void PASTEF77(ch,blasname) \ + ( \ + const f77_char* transa, \ + const f77_char* transb, \ + const f77_int* m, \ + const f77_int* n, \ + const f77_int* k, \ + const ftype* alpha, \ + const ftype* a, const f77_int* lda, \ + const ftype* b, const f77_int* ldb, \ + const ftype* beta, \ + ftype* c, const f77_int* ldc \ + ) \ +{ \ +\ + trans_t blis_transa; \ + trans_t blis_transb; \ + dim_t m0, n0, k0; \ +\ + dim_t m0_a, n0_a; \ + dim_t m0_b, n0_b; \ +\ + /* Initialize BLIS. */ \ + bli_init_auto(); \ +\ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \ + AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *transb, *m, *n, *k, \ + (void*)alpha, *lda, *ldb, (void*)beta, *ldc); \ +\ + /* Perform BLAS parameter checking. */ \ + PASTEBLACHK(blasname) \ + ( \ + MKSTR(ch), \ + MKSTR(blasname), \ + transa, \ + transb, \ + m, \ + n, \ + k, \ + lda, \ + ldb, \ + ldc \ + ); \ +\ + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ + bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \ +\ + /* Typecast BLAS integers to BLIS integers. */ \ + bli_convert_blas_dim1( *m, m0 ); \ + bli_convert_blas_dim1( *n, n0 ); \ + bli_convert_blas_dim1( *k, k0 ); \ +\ + /* Set the row and column strides of the matrix operands. 
*/ \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ + const inc_t rs_b = 1; \ + const inc_t cs_b = *ldb; \ + const inc_t rs_c = 1; \ + const inc_t cs_c = *ldc; \ +\ + if( n0 == 1 ) \ + { \ + if(bli_is_notrans(blis_transa)) \ + { \ + PASTEMAC(ch,gemv_unf_var2)( \ + BLIS_NO_TRANSPOSE, \ + bli_extract_conj(blis_transb), \ + m0, k0, \ + (ftype*)alpha, \ + (ftype*)a, rs_a, cs_a,\ + (ftype*)b, bli_is_notrans(blis_transb)?rs_b:cs_b, \ + (ftype*) beta, \ + c, rs_c, \ + NULL \ + ); \ + } \ + else \ + { \ + PASTEMAC(ch,gemv_unf_var1)( \ + blis_transa, \ + bli_extract_conj(blis_transb), \ + k0, m0, \ + (ftype*)alpha, \ + (ftype*)a, rs_a, cs_a, \ + (ftype*)b, bli_is_notrans(blis_transb)?rs_b:cs_b, \ + (ftype*)beta, \ + c, rs_c, \ + NULL \ + ); \ + } \ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + return; \ + } \ + else if( m0 == 1 ) \ + { \ + if(bli_is_notrans(blis_transb)) \ + { \ + PASTEMAC(ch,gemv_unf_var1)( \ + blis_transb, \ + bli_extract_conj(blis_transa), \ + n0, k0, \ + (ftype*)alpha, \ + (ftype*)b, cs_b, rs_b, \ + (ftype*)a, bli_is_notrans(blis_transa)?cs_a:rs_a, \ + (ftype*)beta, \ + c, cs_c, \ + NULL \ + ); \ + } \ + else \ + { \ + PASTEMAC(ch,gemv_unf_var2)( \ + blis_transb, \ + bli_extract_conj(blis_transa), \ + k0, n0, \ + (ftype*)alpha, \ + (ftype*)b, cs_b, rs_b, \ + (ftype*)a, bli_is_notrans(blis_transa)?cs_a:rs_a, \ + (ftype*)beta, \ + c, cs_c, \ + NULL \ + ); \ + } \ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + return; \ + } \ +\ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ + obj_t ao = BLIS_OBJECT_INITIALIZER; \ + obj_t bo = BLIS_OBJECT_INITIALIZER; \ + obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ + obj_t co = BLIS_OBJECT_INITIALIZER; \ +\ + bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \ + bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); \ +\ + bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, (ftype*)beta, &betao ); \ +\ + bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m0, n0, (ftype*)c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_conjtrans( blis_transa, &ao ); \ + bli_obj_set_conjtrans( blis_transb, &bo ); \ +\ + PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \ + ( \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co, \ + NULL, \ + NULL \ + ); \ +\ + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ +} +#endif + +#ifdef BLIS_ENABLE_BLAS +void dgemm_ +( + const f77_char* transa, + const f77_char* transb, + const f77_int* m, + const f77_int* n, + const f77_int* k, + const double* alpha, + const double* a, const f77_int* lda, + const double* b, const f77_int* ldb, + const double* beta, + double* c, const f77_int* ldc +) +{ + + + + trans_t blis_transa; + trans_t blis_transb; + dim_t m0, n0, k0; + + /* Initialize BLIS. */ + bli_init_auto(); + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k, \ + (void*)alpha, *lda, *ldb, (void*)beta, *ldc); + + /* Perform BLAS parameter checking. */ + PASTEBLACHK(gemm) + ( + MKSTR(d), + MKSTR(gemm), + transa, + transb, + m, + n, + k, + lda, + ldb, + ldc + ); + + /* Map BLAS chars to their corresponding BLIS enumerated type value. 
*/ + bli_param_map_netlib_to_blis_trans(*transa, &blis_transa); + bli_param_map_netlib_to_blis_trans(*transb, &blis_transb); + + /* Typecast BLAS integers to BLIS integers. */ + bli_convert_blas_dim1(*m, m0); + bli_convert_blas_dim1(*n, n0); + bli_convert_blas_dim1(*k, k0); + + + /* Set the row and column strides of the matrix operands. */ + const inc_t rs_a = 1; + const inc_t cs_a = *lda; + const inc_t rs_b = 1; + const inc_t cs_b = *ldb; + const inc_t rs_c = 1; + const inc_t cs_c = *ldc; + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == FALSE) + { + // This code is duplicated below, however we don't want to move it out of + // this IF block as it will affect the performance on Zen architetures + // Also this is temporary fix which will be replaced later. + const num_t dt = BLIS_DOUBLE; + + obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t ao = BLIS_OBJECT_INITIALIZER; + obj_t bo = BLIS_OBJECT_INITIALIZER; + obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t co = BLIS_OBJECT_INITIALIZER; + + dim_t m0_a, n0_a; + dim_t m0_b, n0_b; + + bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a); + bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b); + + bli_obj_init_finish_1x1(dt, (double *)alpha, &alphao); + bli_obj_init_finish_1x1(dt, (double *)beta, &betao); + + bli_obj_init_finish(dt, m0_a, n0_a, (double *)a, rs_a, cs_a, &ao); + bli_obj_init_finish(dt, m0_b, n0_b, (double *)b, rs_b, cs_b, &bo); + bli_obj_init_finish(dt, m0, n0, (double *)c, rs_c, cs_c, &co); + + bli_obj_set_conjtrans(blis_transa, &ao); + bli_obj_set_conjtrans(blis_transb, &bo); + + // Will call parallelized dgemm code - sup & native + PASTEMAC(gemm, BLIS_OAPI_EX_SUF) + ( + &alphao, + &ao, + &bo, + &betao, + &co, + NULL, + NULL + ); + + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS. */ + bli_finalize_auto(); + return; + } + + if((k0 == 1) && bli_is_notrans(blis_transa) && bli_is_notrans(blis_transb)) + { + bli_dgemm_ref_k1_nn( m0, n0, k0, + (double*)alpha, + (double*)a, *lda, + (double*)b, *ldb, + (double*)beta, + c, *ldc + ); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS */ + bli_finalize_auto(); + + return; + } + + if (n0 == 1) + { + if (bli_is_notrans(blis_transa)) + { + bli_dgemv_unf_var2( + BLIS_NO_TRANSPOSE, + bli_extract_conj(blis_transb), + m0, k0, + (double*)alpha, + (double*)a, rs_a, cs_a, + (double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b, + (double*)beta, + c, rs_c, + ((void*)0) + ); + } + else + { + bli_dgemv_unf_var1( + blis_transa, + bli_extract_conj(blis_transb), + k0, m0, + (double*)alpha, + (double*)a, rs_a, cs_a, + (double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b, + (double*)beta, + c, rs_c, + ((void*)0) + ); + } + + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + + return; + } + else if (m0 == 1) + { + if (bli_is_notrans(blis_transb)) + { + bli_dgemv_unf_var1( + blis_transb, + bli_extract_conj(blis_transa), + n0, k0, + (double*)alpha, + (double*)b, cs_b, rs_b, + (double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a, + (double*)beta, + c, cs_c, + ((void*)0) + ); + } + else + { + bli_dgemv_unf_var2( + blis_transb, + bli_extract_conj(blis_transa), + k0, n0, + (double*)alpha, + (double*)b, cs_b, rs_b, + (double*)a, bli_is_notrans(blis_transa) ? 
cs_a : rs_a, + (double*)beta, + c, cs_c, + ((void*)0) + ); + } + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + return; + } + + const num_t dt = BLIS_DOUBLE; + + obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t ao = BLIS_OBJECT_INITIALIZER; + obj_t bo = BLIS_OBJECT_INITIALIZER; + obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t co = BLIS_OBJECT_INITIALIZER; + + dim_t m0_a, n0_a; + dim_t m0_b, n0_b; + + bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a); + bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b); + + bli_obj_init_finish_1x1(dt, (double*)alpha, &alphao); + bli_obj_init_finish_1x1(dt, (double*)beta, &betao); + + bli_obj_init_finish(dt, m0_a, n0_a, (double*)a, rs_a, cs_a, &ao); + bli_obj_init_finish(dt, m0_b, n0_b, (double*)b, rs_b, cs_b, &bo); + bli_obj_init_finish(dt, m0, n0, (double*)c, rs_c, cs_c, &co); + + bli_obj_set_conjtrans(blis_transa, &ao); + bli_obj_set_conjtrans(blis_transb, &bo); + + //cntx_t* cntx = bli_gks_query_cntx(); + //dim_t nt = bli_thread_get_num_threads(); // get number of threads + bool nt = bli_thread_get_is_parallel(); // Check if parallel dgemm is invoked. + + // if m0 is large and (n0 & k0) < 10 - SMALL GEMM - ST is better + // + +#ifdef AOCL_DYNAMIC + if (nt && ((n0 > 10 ) || (k0 > 10)) ) +#else + if (nt) +#endif + { + // Will call parallelized dgemm code - sup & native + PASTEMAC(gemm, BLIS_OAPI_EX_SUF) + ( + &alphao, + &ao, + &bo, + &betao, + &co, + NULL, + NULL + ); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS. */ + bli_finalize_auto(); + return; + } + + // The code below will be called when number of threads = 1. + +#ifdef BLIS_ENABLE_SMALL_MATRIX + + //if( ((m0 + n0 -k0) < 2000) && ((m0 + k0-n0) < 2000) && ((n0 + k0-m0) < 2000) && (n0 > 2)) + if( ( ( (m0 + n0 -k0) < 2000) && ((m0 + k0-n0) < 2000) && ((n0 + k0-m0) < 2000) ) || + ((n0 <= 10) && (k0 <=10)) ) + { + err_t status; + if (bli_is_notrans(blis_transa)) + { + status = bli_dgemm_small( &alphao, + &ao, + &bo, + &betao, + &co, + NULL, //cntx, + NULL + ); + } + else + { + status = bli_dgemm_small_At ( &alphao, + &ao, + &bo, + &betao, + &co, + NULL, //cntx, + NULL + ); + } + + if (status == BLIS_SUCCESS) + { + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS. */ + bli_finalize_auto(); + + return; + } + } + +#endif //#ifdef BLIS_ENABLE_SMALL_MATRIX + + err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL); + if (status == BLIS_SUCCESS) + { + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + return; + } + + // fall back on native path when dgemm is not handled in sup path. + bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL); + + + /* PASTEMAC(gemm, BLIS_OAPI_EX_SUF) */ + /* ( */ + /* &alphao, */ + /* &ao, */ + /* &bo, */ + /* &betao, */ + /* &co, */ + /* NULL, */ + /* NULL */ + /* ); */ + + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS. */ + bli_finalize_auto(); +} // end of dgemm_ + +void zgemm_ + ( + const f77_char* transa, + const f77_char* transb, + const f77_int* m, + const f77_int* n, + const f77_int* k, + const dcomplex* alpha, + const dcomplex* a, const f77_int* lda, + const dcomplex* b, const f77_int* ldb, + const dcomplex* beta, + dcomplex* c, const f77_int* ldc + ) +{ + trans_t blis_transa; + trans_t blis_transb; + dim_t m0, n0, k0; + + /* Initialize BLIS. 
*/ + bli_init_auto(); + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k, + (void*)alpha, *lda, *ldb, (void*)beta, *ldc); + + /* Perform BLAS parameter checking. */ + PASTEBLACHK(gemm) + ( + MKSTR(z), + MKSTR(gemm), + transa, + transb, + m, + n, + k, + lda, + ldb, + ldc + ); + + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); + bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); + + /* Typecast BLAS integers to BLIS integers. */ + bli_convert_blas_dim1( *m, m0 ); + bli_convert_blas_dim1( *n, n0 ); + bli_convert_blas_dim1( *k, k0 ); + + /* Set the row and column strides of the matrix operands. */ + const inc_t rs_a = 1; + const inc_t cs_a = *lda; + const inc_t rs_b = 1; + const inc_t cs_b = *ldb; + const inc_t rs_c = 1; + const inc_t cs_c = *ldc; + + const num_t dt = BLIS_DCOMPLEX; + + obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t ao = BLIS_OBJECT_INITIALIZER; + obj_t bo = BLIS_OBJECT_INITIALIZER; + obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t co = BLIS_OBJECT_INITIALIZER; + + dim_t m0_a, n0_a; + dim_t m0_b, n0_b; + + bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); + bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); + + bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao ); + bli_obj_init_finish_1x1( dt, (dcomplex*)beta, &betao ); + + bli_obj_init_finish( dt, m0_a, n0_a, (dcomplex*)a, rs_a, cs_a, &ao ); + bli_obj_init_finish( dt, m0_b, n0_b, (dcomplex*)b, rs_b, cs_b, &bo ); + bli_obj_init_finish( dt, m0, n0, (dcomplex*)c, rs_c, cs_c, &co ); + + bli_obj_set_conjtrans( blis_transa, &ao ); + bli_obj_set_conjtrans( blis_transb, &bo ); + + // default instance peformance tuning is done in zgemm. + // Single instance tuning is done based on env set. + dim_t single_instance = bli_env_get_var( "BLIS_SINGLE_INSTANCE", -1 ); + + //dim_t nt = bli_thread_get_num_threads(); // get number of threads + bool nt = bli_thread_get_is_parallel(); // Check if parallel zgemm is invoked. + if ( nt ) + { + // Will call parallelized zgemm code - sup & native + PASTEMAC(gemm, BLIS_OAPI_EX_SUF) + ( + &alphao, + &ao, + &bo, + &betao, + &co, + NULL, + NULL + ); + + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS. */ + bli_finalize_auto(); + return; + } + + // The code below will be called when number of threads = 1. +#if ENABLE_INDUCED_METHOD + /* 3m_sqp is optimal for certain matrix shapes. + Initial study that it works well for square sizes and sizes closer to square shape. + + * Usage of 3m_sqp is restricted to sizes, where it is found efficient compared to native, sup and other induced method. + * Further investigation is necessary to make the usage choices more generic. */ + bool sqp_on = false; + if( (m0 == n0 ) && ( n0 == k0 ) && ( m0 == 128 ) ) + { + sqp_on = true; + } + + // current range of sizes used for 3m_sqp to be expaned after evaluation. + if( ( m0 >= 4200) && ( m0 <= 4600 ) && ( ( n0 >= 326 ) || (n0 <= 1600 ) ) + && ( k0 == 1120 ) ) //to be tuned further. 
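The BLIS_SINGLE_INSTANCE variable read above is the only external knob in this zgemm_; it is tested just below once the single-threaded path is reached. A hedged usage sketch:

/* When exported by the launcher, e.g.
       BLIS_SINGLE_INSTANCE=1 ./app
   the single-threaded zgemm_ first tries bli_gemmsup() and only falls
   back to bli_gemmnat() if the sup path declines. Any other value, or
   leaving the variable unset (default -1), goes straight to native. */
static bool zgemm_prefer_sup_sketch( void )
{
    dim_t single_instance = bli_env_get_var( "BLIS_SINGLE_INSTANCE", -1 );
    return ( single_instance == 1 );
}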
+ { + sqp_on = true; + } + + if( ( blis_transb == BLIS_NO_TRANSPOSE) && ( sqp_on == true ) ) + { + //sqp algo is found better for n > 40 + if(bli_gemm_sqp(&alphao, &ao, &bo, &betao, &co, NULL, NULL)==BLIS_SUCCESS) + { + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) + return; + } + } +#endif//ENABLE_INDUCED_METHOD + +// native tuning resulted in better numbers compared to sup in constrained multi-instance +// sup has been enabled for single instance cases. + if(single_instance==1) + { + err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL); + if(status==BLIS_SUCCESS) + { + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) + return; + } + + } + // fall back on native path when zgemm is not handled in sup path. + bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL); + AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) + return; + + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) + /* Finalize BLIS. */ + bli_finalize_auto(); +}// end of zgemm_ + + +INSERT_GENTFUNC_BLAS_SC( gemm, gemm ) + + +// Observed a regression in dgemm with this function addition. +// Disabling temporarily. +#if 0 +void dzgemm_ + ( + const f77_char* transa, + const f77_char* transb, + const f77_int* m, + const f77_int* n, + const f77_int* k, + const dcomplex* alpha, + const double* a, const f77_int* lda, + const dcomplex* b, const f77_int* ldb, + const dcomplex* beta, + dcomplex* c, const f77_int* ldc + ) +{ + + trans_t blis_transa; + trans_t blis_transb; + dim_t m0, n0, k0; + + /* Initialize BLIS. */ + bli_init_auto(); + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k, + (void*)alpha, *lda, *ldb, (void*)beta, *ldc); + + /* Perform BLAS parameter checking. */ + PASTEBLACHK(gemm) + ( + MKSTR(z), + MKSTR(gemm), + transa, + transb, + m, + n, + k, + lda, + ldb, + ldc + ); + + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); + bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); + + /* Typecast BLAS integers to BLIS integers. */ + bli_convert_blas_dim1( *m, m0 ); + bli_convert_blas_dim1( *n, n0 ); + bli_convert_blas_dim1( *k, k0 ); + + /* Set the row and column strides of the matrix operands. 
*/ + const inc_t rs_a = 1; + const inc_t cs_a = *lda; + const inc_t rs_b = 1; + const inc_t cs_b = *ldb; + const inc_t rs_c = 1; + const inc_t cs_c = *ldc; + + const num_t dt = BLIS_DCOMPLEX; + const num_t dt_a = BLIS_DOUBLE; + + obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t ao = BLIS_OBJECT_INITIALIZER; + obj_t bo = BLIS_OBJECT_INITIALIZER; + obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t co = BLIS_OBJECT_INITIALIZER; + + dim_t m0_a, n0_a; + dim_t m0_b, n0_b; + + bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); + bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); + + bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao ); + bli_obj_init_finish_1x1( dt, (dcomplex*)beta, &betao ); + + bli_obj_init_finish( dt_a, m0_a, n0_a, (dcomplex*)a, rs_a, cs_a, &ao ); + bli_obj_init_finish( dt, m0_b, n0_b, (dcomplex*)b, rs_b, cs_b, &bo ); + bli_obj_init_finish( dt, m0, n0, (dcomplex*)c, rs_c, cs_c, &co ); + + bli_obj_set_conjtrans( blis_transa, &ao ); + bli_obj_set_conjtrans( blis_transb, &bo ); + + // fall back on native path when zgemm is not handled in sup path. + bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL); + + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) + /* Finalize BLIS. */ + bli_finalize_auto(); +}// end of dzgemm_ +#endif +#endif diff --git a/frame/compat/bla_gemv.c b/frame/compat/bla_gemv.c index af2745ca9..9dba1b43c 100644 --- a/frame/compat/bla_gemv.c +++ b/frame/compat/bla_gemv.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020 - 21, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -147,856 +147,5 @@ void PASTEF77(ch,blasname) \ #ifdef BLIS_ENABLE_BLAS -#ifdef BLIS_CONFIG_EPYC -void dgemv_ - ( - const f77_char* transa, - const f77_int* m, - const f77_int* n, - const double* alpha, - const double* a, const f77_int* lda, - const double* x, const f77_int* incx, - const double* beta, - double* y, const f77_int* incy - ) -{ - trans_t blis_transa; - dim_t m0, n0; - dim_t m_y, n_x; - double* x0; - double* y0; - inc_t incx0; - inc_t incy0; - inc_t rs_a, cs_a; - - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); - AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); - - /* Perform BLAS parameter checking. */ - PASTEBLACHK(gemv) - ( - MKSTR(d), - MKSTR(gemv), - transa, - m, - n, - lda, - incx, - incy - ); - - if (*m == 0 || *n == 0) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - /* Map BLAS chars to their corresponding BLIS enumerated type value. */ - if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE; - else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE; - else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE; - else - { - // See comment for bli_param_map_netlib_to_blis_side() above. - //bli_check_error_code( BLIS_INVALID_TRANS ); - blis_transa = BLIS_NO_TRANSPOSE; - } - - /* Convert/typecast negative values of m and n to zero. 
*/ - if ( *m < 0 ) m0 = ( dim_t )0; - else m0 = ( dim_t )(*m); - - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* Determine the dimensions of x and y so we can adjust the increments, - if necessary.*/ - if ( bli_does_notrans( blis_transa ) ) - { - m_y = m0; - n_x = n0; - } - else - { - m_y = n0; - n_x = m0; - } - - /* BLAS handles cases where trans(A) has no columns, and x has no elements, - in a peculiar way. In these situations, BLAS returns without performing - any action, even though most sane interpretations of gemv would have the - the operation reduce to y := beta * y. Here, we catch those cases that - BLAS would normally mishandle and emulate the BLAS exactly so as to - provide "bug-for-bug" compatibility. Note that this extreme level of - compatibility would not be as much of an issue if it weren't for the - fact that some BLAS test suites actually test for these cases. Also, it - should be emphasized that BLIS, if called natively, does NOT exhibit - this quirky behavior; it will scale y by beta, as one would expect. */ - if ( m_y > 0 && n_x == 0 ) - { - /* Finalize BLIS. */ - // bli_finalize_auto(); - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if ( *incx < 0 ) - { - x0 = ((double*)x) + (n_x-1)*(-*incx); - incx0 = ( inc_t )(*incx); - } - else - { - x0 = ((double*)x); - incx0 = ( inc_t )(*incx); - } - - if ( *incy < 0 ) - { - y0 = ((double*)y) + (m_y-1)*(-*incy); - incy0 = ( inc_t )(*incy); - } - else - { - y0 = ((double*)y); - incy0 = ( inc_t )(*incy); - } - - /* Set the row and column strides of A. */ - rs_a = 1; - cs_a = *lda; - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen == 0) - { - /* Call BLIS interface. */ - PASTEMAC2(d,gemv,BLIS_TAPI_EX_SUF) - ( - blis_transa, - BLIS_NO_CONJUGATE, - m0, - n0, - (double*)alpha, - (double*)a, rs_a, cs_a, - x0, incx0, - (double*)beta, - y0, incy0, - NULL, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - /* Call variants based on transpose value. 
*/ - if(bli_does_notrans(blis_transa)) - { - //variant_2 is chosen for column-storage - // and uses axpyf-based implementation - bli_dgemv_unf_var2 - ( - blis_transa, - BLIS_NO_CONJUGATE, - m0, - n0, - (double*)alpha, - (double*)a, rs_a, cs_a, - x0, incx0, - (double*)beta, - y0, incy0, - NULL - ); - } - else - { - //var_1 is chosen for row-storage - //and uses dotxf-based implementation - bli_dgemv_unf_var1 - ( - blis_transa, - BLIS_NO_CONJUGATE, - m0, - n0, - (double*)alpha, - (double*)a, rs_a, cs_a, - x0, incx0, - (double*)beta, - y0, incy0, - NULL - ); - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); -} - -void sgemv_ - ( - const f77_char* transa, - const f77_int* m, - const f77_int* n, - const float* alpha, - const float* a, const f77_int* lda, - const float* x, const f77_int* incx, - const float* beta, - float* y, const f77_int* incy - ) -{ - trans_t blis_transa; - dim_t m0, n0; - dim_t m_y, n_x; - float* x0; - float* y0; - inc_t incx0; - inc_t incy0; - inc_t rs_a, cs_a; - - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); - AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); - /* Perform BLAS parameter checking. */ - PASTEBLACHK(gemv) - ( - MKSTR(s), - MKSTR(gemv), - transa, - m, - n, - lda, - incx, - incy - ); - - if (*m == 0 || *n == 0) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - /* Map BLAS chars to their corresponding BLIS enumerated type value. */ - if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE; - else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE; - else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE; - else - { - // See comment for bli_param_map_netlib_to_blis_side() above. - //bli_check_error_code( BLIS_INVALID_TRANS ); - blis_transa = BLIS_NO_TRANSPOSE; - } - - /* Convert/typecast negative values of m and n to zero. */ - if ( *m < 0 ) m0 = ( dim_t )0; - else m0 = ( dim_t )(*m); - - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* Determine the dimensions of x and y so we can adjust the increments, - if necessary.*/ - if ( bli_does_notrans( blis_transa ) ) - { - m_y = m0; - n_x = n0; - } - else - { - m_y = n0; - n_x = m0; - } - - /* BLAS handles cases where trans(A) has no columns, and x has no elements, - in a peculiar way. In these situations, BLAS returns without performing - any action, even though most sane interpretations of gemv would have the - the operation reduce to y := beta * y. Here, we catch those cases that - BLAS would normally mishandle and emulate the BLAS exactly so as to - provide "bug-for-bug" compatibility. Note that this extreme level of - compatibility would not be as much of an issue if it weren't for the - fact that some BLAS test suites actually test for these cases. Also, it - should be emphasized that BLIS, if called natively, does NOT exhibit - this quirky behavior; it will scale y by beta, as one would expect. */ - if ( m_y > 0 && n_x == 0 ) - { - /* Finalize BLIS. */ - // bli_finalize_auto(); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. 
*/ - if ( *incx < 0 ) - { - x0 = ((float*)x) + (n_x-1)*(-*incx); - incx0 = ( inc_t )(*incx); - } - else - { - x0 = ((float*)x); - incx0 = ( inc_t )(*incx); - } - - if ( *incy < 0 ) - { - y0 = ((float*)y) + (m_y-1)*(-*incy); - incy0 = ( inc_t )(*incy); - } - else - { - y0 = ((float*)y); - incy0 = ( inc_t )(*incy); - } - - /* Set the row and column strides of A. */ - rs_a = 1; - cs_a = *lda; - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen == 0) - { - /* Call BLIS interface. */ - PASTEMAC2(s,gemv,BLIS_TAPI_EX_SUF) - ( - blis_transa, - BLIS_NO_CONJUGATE, - m0, - n0, - (float*)alpha, - (float*)a, rs_a, cs_a, - x0, incx0, - (float*)beta, - y0, incy0, - NULL, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - /* Call variants based on transpose value. */ - if(bli_does_notrans(blis_transa)) - { - bli_sgemv_unf_var2 - ( - blis_transa, - BLIS_NO_CONJUGATE, - m0, - n0, - (float*)alpha, - (float*)a, rs_a, cs_a, - x0, incx0, - (float*)beta, - y0, incy0, - NULL - ); - } - else - { - bli_sgemv_unf_var1 - ( - blis_transa, - BLIS_NO_CONJUGATE, - m0, - n0, - (float*)alpha, - (float*)a, rs_a, cs_a, - x0, incx0, - (float*)beta, - y0, incy0, - NULL - ); - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); -} - - -void cgemv_ - ( - const f77_char* transa, - const f77_int* m, - const f77_int* n, - const scomplex* alpha, - const scomplex* a, const f77_int* lda, - const scomplex* x, const f77_int* incx, - const scomplex* beta, - scomplex* y, const f77_int* incy - ) -{ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); - AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); - - trans_t blis_transa; - dim_t m0, n0; - dim_t m_y, n_x; - scomplex* x0; - scomplex* y0; - inc_t incx0; - inc_t incy0; - inc_t rs_a, cs_a; - - /* Perform BLAS parameter checking. */ - PASTEBLACHK(gemv) - ( - MKSTR(c), - MKSTR(gemv), - transa, - m, - n, - lda, - incx, - incy - ); - - if (*m == 0 || *n == 0) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - /* Map BLAS chars to their corresponding BLIS enumerated type value. */ - if( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE; - else if( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE; - else if( * transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE; - else - { - // See comment for bli_param_map_netlib_to_blis_side() above. - // bli_check_error_code( BLIS_INVALID_TRANS ); - blis_transa = BLIS_NO_TRANSPOSE; - } - - /* Convert/typecast negative values of m and n to zero. */ - if( *m < 0 ) m0 = (dim_t)0; - else m0 = (dim_t)(*m); - - if( *n < 0 ) n0 = (dim_t)0; - else n0 = (dim_t)(*n); - - /* Determine the dimensions of x and y so we can adjust the increments, - if necessary.*/ - if( bli_does_notrans( blis_transa ) ) { m_y = m0, n_x = n0; } - else { m_y = n0; n_x = m0; } - - /* BLAS handles cases where trans(A) has no columns, and x has no elements, - in a peculiar way. 
In these situations, BLAS returns without performing - any action, even though most sane interpretations of gemv would have the - the operation reduce to y := beta * y. Here, we catch those cases that - BLAS would normally mishandle and emulate the BLAS exactly so as to - provide "bug-for-bug" compatibility. Note that this extreme level of - compatibility would not be as much of an issue if it weren't for the - fact that some BLAS test suites actually test for these cases. Also, it - should be emphasized that BLIS, if called natively, does NOT exhibit - this quirky behavior; it will scale y by beta, as one would expect. */ - - if ( m_y > 0 && n_x == 0 ) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if( *incx < 0 ) - { - x0 = ((scomplex*)x) + (n_x-1)*(-*incx); - incx0 = ( inc_t )(*incx); - } - else - { - x0 = ((scomplex*)x); - incx0 = (inc_t)(*incx); - } - - if ( *incy < 0 ) - { - y0 = ((scomplex*)y) + (m_y-1)*(-*incy); - incy0 = ( inc_t )(*incy); - } - else - { - y0 = ((scomplex*)y); - incy0 = ( inc_t )(*incy); - } - - /* Set the row and column strides of A. */ - rs_a = 1; - cs_a = *lda; - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if( m_y == 1 ) - { - conj_t conja = bli_extract_conj(blis_transa); - scomplex rho; - if (bamdzen) - { - bli_cdotv_zen_int5 - ( - conja, - BLIS_NO_CONJUGATE, - n_x, - (scomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a, - x0, incx0, - &rho, - NULL - ); - } - else - { - /* Call BLIS interface. */ - PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF) - ( - conja, - BLIS_NO_CONJUGATE, - n_x, - (scomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a, - x0, incx0, - &rho, - NULL, - NULL - ); - } - - scomplex yval = *y0; - if(!bli_ceq0(*beta)) - { - bli_cscals( *beta, yval ); - } - else - { - bli_csetsc( 0.0, 0.0, &yval); - } - if(!bli_ceq0(*alpha)) - { - bli_caxpys( *alpha, rho, yval); - } - y0->real = yval.real; - y0->imag = yval.imag; - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - if (bamdzen == 0) - { - /* Call BLIS interface. 
*/ - PASTEMAC2(c,gemv,BLIS_TAPI_EX_SUF) - ( - blis_transa, - BLIS_NO_CONJUGATE, - m0, - n0, - (scomplex*)alpha, - (scomplex*)a, rs_a, cs_a, - x0, incx0, - (scomplex*)beta, - y0, incy0, - NULL, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - /* call variants based on transpose value */ - if( bli_does_notrans( blis_transa ) ) - { - bli_cgemv_unf_var2 - ( - blis_transa, - BLIS_NO_CONJUGATE, - m0, - n0, - (scomplex*)alpha, - (scomplex*)a, rs_a, cs_a, - x0, incx0, - (scomplex*)beta, - y0, incy0, - NULL - ); - } - else - { - bli_cgemv_unf_var1 - ( - blis_transa, - BLIS_NO_CONJUGATE, - m0, - n0, - (scomplex*)alpha, - (scomplex*)a, rs_a, cs_a, - x0, incx0, - (scomplex*)beta, - y0, incy0, - NULL - ); - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); -} - - -void zgemv_ - ( - const f77_char* transa, - const f77_int* m, - const f77_int* n, - const dcomplex* alpha, - const dcomplex* a, const f77_int* lda, - const dcomplex* x, const f77_int* incx, - const dcomplex* beta, - dcomplex* y, const f77_int* incy - ) -{ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); - AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); - - trans_t blis_transa; - dim_t m0, n0; - dim_t m_y, n_x; - dcomplex* x0; - dcomplex* y0; - inc_t incx0; - inc_t incy0; - inc_t rs_a, cs_a; - - /* Perform BLAS parameter checking. */ - PASTEBLACHK(gemv) - ( - MKSTR(z), - MKSTR(gemv), - transa, - m, - n, - lda, - incx, - incy - ); - - if (*m == 0 || *n == 0) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - /* Map BLAS chars to their corresponding BLIS enumerated type value. */ - if( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE; - else if( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE; - else if( * transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE; - else - { - // See comment for bli_param_map_netlib_to_blis_side() above. - // bli_check_error_code( BLIS_INVALID_TRANS ); - blis_transa = BLIS_NO_TRANSPOSE; - } - - /* Convert/typecast negative values of m and n to zero. */ - if( *m < 0 ) m0 = (dim_t)0; - else m0 = (dim_t)(*m); - - if( *n < 0 ) n0 = (dim_t)0; - else n0 = (dim_t)(*n); - - /* Determine the dimensions of x and y so we can adjust the increments, - if necessary.*/ - if( bli_does_notrans( blis_transa ) ) { m_y = m0, n_x = n0; } - else { m_y = n0; n_x = m0; } - - /* BLAS handles cases where trans(A) has no columns, and x has no elements, - in a peculiar way. In these situations, BLAS returns without performing - any action, even though most sane interpretations of gemv would have the - the operation reduce to y := beta * y. Here, we catch those cases that - BLAS would normally mishandle and emulate the BLAS exactly so as to - provide "bug-for-bug" compatibility. Note that this extreme level of - compatibility would not be as much of an issue if it weren't for the - fact that some BLAS test suites actually test for these cases. Also, it - should be emphasized that BLIS, if called natively, does NOT exhibit - this quirky behavior; it will scale y by beta, as one would expect. */ - - if ( m_y > 0 && n_x == 0 ) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. 
*/ - if( *incx < 0 ) - { - x0 = ((dcomplex*)x) + (n_x-1)*(-*incx); - incx0 = ( inc_t )(*incx); - } - else - { - x0 = ((dcomplex*)x); - incx0 = (inc_t)(*incx); - } - - if ( *incy < 0 ) - { - y0 = ((dcomplex*)y) + (m_y-1)*(-*incy); - incy0 = ( inc_t )(*incy); - } - else - { - y0 = ((dcomplex*)y); - incy0 = ( inc_t )(*incy); - } - - /* Set the row and column strides of A. */ - rs_a = 1; - cs_a = *lda; - - // When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration. - // This function is invoked on all architectures including ‘generic’. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if( m_y == 1 ) - { - conj_t conja = bli_extract_conj(blis_transa); - dcomplex rho; - - if (bamdzen) - { - bli_zdotv_zen_int5 - ( - conja, - BLIS_NO_CONJUGATE, - n_x, - (dcomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a, - x0, incx0, - &rho, - NULL - ); - } - else - { - /* Call BLIS interface. */ - PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF) - ( - conja, - BLIS_NO_CONJUGATE, - n_x, - (dcomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a, - x0, incx0, - &rho, - NULL, - NULL - ); - } - - dcomplex yval = *y0; - if(!bli_zeq0(*beta)) - { - bli_zscals( *beta, yval ); - } - else - { - bli_zsetsc( 0.0, 0.0, &yval); - } - if(!bli_zeq0(*alpha)) - { - bli_zaxpys( *alpha, rho, yval); - } - y0->real = yval.real; - y0->imag = yval.imag; - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - if (bamdzen == 0) - { - /* Call BLIS interface. */ - PASTEMAC2(z,gemv,BLIS_TAPI_EX_SUF) - ( - blis_transa, - BLIS_NO_CONJUGATE, - m0, - n0, - (dcomplex*)alpha, - (dcomplex*)a, rs_a, cs_a, - x0, incx0, - (dcomplex*)beta, - y0, incy0, - NULL, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - /* call variants based on transpose value */ - if( bli_does_notrans( blis_transa ) ) - { - bli_zgemv_unf_var2 - ( - blis_transa, - BLIS_NO_CONJUGATE, - m0, - n0, - (dcomplex*)alpha, - (dcomplex*)a, rs_a, cs_a, - x0, incx0, - (dcomplex*)beta, - y0, incy0, - NULL - ); - } - else - { - bli_zgemv_unf_var1 - ( - blis_transa, - BLIS_NO_CONJUGATE, - m0, - n0, - (dcomplex*)alpha, - (dcomplex*)a, rs_a, cs_a, - x0, incx0, - (dcomplex*)beta, - y0, incy0, - NULL - ); - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); -} - - -#else INSERT_GENTFUNC_BLAS( gemv, gemv ) #endif -#endif diff --git a/frame/compat/bla_gemv_amd.c b/frame/compat/bla_gemv_amd.c new file mode 100644 index 000000000..354f45fe1 --- /dev/null +++ b/frame/compat/bla_gemv_amd.c @@ -0,0 +1,963 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +// +// Define BLAS-to-BLIS interfaces. +// +#undef GENTFUNC +#define GENTFUNC( ftype, ch, blasname, blisname ) \ +\ +void PASTEF77(ch,blasname) \ + ( \ + const f77_char* transa, \ + const f77_int* m, \ + const f77_int* n, \ + const ftype* alpha, \ + const ftype* a, const f77_int* lda, \ + const ftype* x, const f77_int* incx, \ + const ftype* beta, \ + ftype* y, const f77_int* incy \ + ) \ +{ \ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \ + AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); \ + trans_t blis_transa; \ + dim_t m0, n0; \ + dim_t m_y, n_x; \ + ftype* x0; \ + ftype* y0; \ + inc_t incx0; \ + inc_t incy0; \ + inc_t rs_a, cs_a; \ +\ + /* Initialize BLIS. */ \ + bli_init_auto(); \ +\ + /* Perform BLAS parameter checking. */ \ + PASTEBLACHK(blasname) \ + ( \ + MKSTR(ch), \ + MKSTR(blasname), \ + transa, \ + m, \ + n, \ + lda, \ + incx, \ + incy \ + ); \ +\ + if (*m == 0 || *n == 0) { \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ + return; \ + } \ +\ + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ +\ + /* Convert/typecast negative values of m and n to zero. */ \ + bli_convert_blas_dim1( *m, m0 ); \ + bli_convert_blas_dim1( *n, n0 ); \ +\ + /* Determine the dimensions of x and y so we can adjust the increments, + if necessary.*/ \ + bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \ +\ + /* BLAS handles cases where trans(A) has no columns, and x has no elements, + in a peculiar way. In these situations, BLAS returns without performing + any action, even though most sane interpretations of gemv would have the + the operation reduce to y := beta * y. Here, we catch those cases that + BLAS would normally mishandle and emulate the BLAS exactly so as to + provide "bug-for-bug" compatibility. Note that this extreme level of + compatibility would not be as much of an issue if it weren't for the + fact that some BLAS test suites actually test for these cases. Also, it + should be emphasized that BLIS, if called natively, does NOT exhibit + this quirky behavior; it will scale y by beta, as one would expect. */ \ + if ( m_y > 0 && n_x == 0 ) \ + { \ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ +\ + return; \ + } \ +\ + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. 
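The early return above ( m_y > 0 && n_x == 0 ) encodes the BLAS quirk described in the comment: when trans(A) has no columns, netlib BLAS returns without touching y, even when beta would otherwise scale it, and this wrapper reproduces that behavior for compatibility. A small caller-side illustration of the difference follows; it is a hedged sketch with hypothetical values, assuming a Fortran-style dgemv_ symbol and a 32-bit f77_int.

    /* Hedged sketch: the gemv "no columns" quirk preserved for BLAS compatibility. */
    extern void dgemv_( const char* trans, const int* m, const int* n,
                        const double* alpha, const double* a, const int* lda,
                        const double* x, const int* incx,
                        const double* beta, double* y, const int* incy );

    void show_no_column_quirk( void )
    {
        int    m = 3, n = 0, lda = 3, inc = 1;    /* A is 3 x 0: no columns     */
        double alpha = 1.0, beta = 0.0;
        double a[1] = { 0.0 };                    /* never read when n == 0    */
        double x[1] = { 0.0 };
        double y[3] = { 1.0, 2.0, 3.0 };

        /* BLAS semantics (emulated above): returns immediately, y stays {1,2,3}.
           A native BLIS call would instead scale y by beta and produce {0,0,0}. */
        dgemv_( "N", &m, &n, &alpha, a, &lda, x, &inc, &beta, y, &inc );
    }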
*/ \ + bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \ + bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \ +\ + /* Set the row and column strides of A. */ \ + rs_a = 1; \ + cs_a = *lda; \ +\ + /* Call BLIS interface. */ \ + PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ + ( \ + blis_transa, \ + BLIS_NO_CONJUGATE, \ + m0, \ + n0, \ + (ftype*)alpha, \ + (ftype*)a, rs_a, cs_a, \ + x0, incx0, \ + (ftype*)beta, \ + y0, incy0, \ + NULL, \ + NULL \ + ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ +} + + +#ifdef BLIS_ENABLE_BLAS +void dgemv_ + ( + const f77_char* transa, + const f77_int* m, + const f77_int* n, + const double* alpha, + const double* a, const f77_int* lda, + const double* x, const f77_int* incx, + const double* beta, + double* y, const f77_int* incy + ) +{ + trans_t blis_transa; + dim_t m0, n0; + dim_t m_y, n_x; + double* x0; + double* y0; + inc_t incx0; + inc_t incy0; + inc_t rs_a, cs_a; + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); + + /* Perform BLAS parameter checking. */ + PASTEBLACHK(gemv) + ( + MKSTR(d), + MKSTR(gemv), + transa, + m, + n, + lda, + incx, + incy + ); + + if (*m == 0 || *n == 0) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ + if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE; + else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE; + else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE; + else + { + // See comment for bli_param_map_netlib_to_blis_side() above. + //bli_check_error_code( BLIS_INVALID_TRANS ); + blis_transa = BLIS_NO_TRANSPOSE; + } + + /* Convert/typecast negative values of m and n to zero. */ + if ( *m < 0 ) m0 = ( dim_t )0; + else m0 = ( dim_t )(*m); + + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* Determine the dimensions of x and y so we can adjust the increments, + if necessary.*/ + if ( bli_does_notrans( blis_transa ) ) + { + m_y = m0; + n_x = n0; + } + else + { + m_y = n0; + n_x = m0; + } + + /* BLAS handles cases where trans(A) has no columns, and x has no elements, + in a peculiar way. In these situations, BLAS returns without performing + any action, even though most sane interpretations of gemv would have the + the operation reduce to y := beta * y. Here, we catch those cases that + BLAS would normally mishandle and emulate the BLAS exactly so as to + provide "bug-for-bug" compatibility. Note that this extreme level of + compatibility would not be as much of an issue if it weren't for the + fact that some BLAS test suites actually test for these cases. Also, it + should be emphasized that BLIS, if called natively, does NOT exhibit + this quirky behavior; it will scale y by beta, as one would expect. */ + if ( m_y > 0 && n_x == 0 ) + { + /* Finalize BLIS. */ + // bli_finalize_auto(); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. 
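The GENTFUNC definition that ends above stamps out one BLAS wrapper per datatype by token pasting (PASTEF77(ch,blasname) expands to names such as dgemv_), which is how the original bla_gemv.c generates all four wrappers via INSERT_GENTFUNC_BLAS; the hand-written dgemv_/sgemv_/cgemv_/zgemv_ definitions that follow in this file are the AMD-tuned versions. A stripped-down sketch of the same token-pasting pattern is shown below, with hypothetical macro and function names, purely to illustrate how a single macro body yields several typed functions.

    /* Hedged sketch of the type-macro pattern; names are illustrative, not BLIS's. */
    #define PASTE_( ch, name )  ch ## name ## _
    #define PASTE( ch, name )   PASTE_( ch, name )

    #define GEN_SCALE_FUNC( ftype, ch )                                        \
    void PASTE(ch,scaleall)( int n, ftype alpha, ftype* x )                    \
    {                                                                          \
        for ( int i = 0; i < n; ++i ) x[i] *= alpha;  /* same body, per type */\
    }

    GEN_SCALE_FUNC( float,  s )   /* expands to: void sscaleall_( ... ) */
    GEN_SCALE_FUNC( double, d )   /* expands to: void dscaleall_( ... ) */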
*/ + if ( *incx < 0 ) + { + x0 = ((double*)x) + (n_x-1)*(-*incx); + incx0 = ( inc_t )(*incx); + } + else + { + x0 = ((double*)x); + incx0 = ( inc_t )(*incx); + } + + if ( *incy < 0 ) + { + y0 = ((double*)y) + (m_y-1)*(-*incy); + incy0 = ( inc_t )(*incy); + } + else + { + y0 = ((double*)y); + incy0 = ( inc_t )(*incy); + } + + /* Set the row and column strides of A. */ + rs_a = 1; + cs_a = *lda; + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == FALSE) + { + /* Call BLIS interface. */ + PASTEMAC2(d,gemv,BLIS_TAPI_EX_SUF) + ( + blis_transa, + BLIS_NO_CONJUGATE, + m0, + n0, + (double*)alpha, + (double*)a, rs_a, cs_a, + x0, incx0, + (double*)beta, + y0, incy0, + NULL, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + /* Call variants based on transpose value. */ + if(bli_does_notrans(blis_transa)) + { + //variant_2 is chosen for column-storage + // and uses axpyf-based implementation + bli_dgemv_unf_var2 + ( + blis_transa, + BLIS_NO_CONJUGATE, + m0, + n0, + (double*)alpha, + (double*)a, rs_a, cs_a, + x0, incx0, + (double*)beta, + y0, incy0, + NULL + ); + } + else + { + //var_1 is chosen for row-storage + //and uses dotxf-based implementation + bli_dgemv_unf_var1 + ( + blis_transa, + BLIS_NO_CONJUGATE, + m0, + n0, + (double*)alpha, + (double*)a, rs_a, cs_a, + x0, incx0, + (double*)beta, + y0, incy0, + NULL + ); + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); +} + +void sgemv_ + ( + const f77_char* transa, + const f77_int* m, + const f77_int* n, + const float* alpha, + const float* a, const f77_int* lda, + const float* x, const f77_int* incx, + const float* beta, + float* y, const f77_int* incy + ) +{ + trans_t blis_transa; + dim_t m0, n0; + dim_t m_y, n_x; + float* x0; + float* y0; + inc_t incx0; + inc_t incy0; + inc_t rs_a, cs_a; + + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); + /* Perform BLAS parameter checking. */ + PASTEBLACHK(gemv) + ( + MKSTR(s), + MKSTR(gemv), + transa, + m, + n, + lda, + incx, + incy + ); + + if (*m == 0 || *n == 0) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ + if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE; + else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE; + else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE; + else + { + // See comment for bli_param_map_netlib_to_blis_side() above. + //bli_check_error_code( BLIS_INVALID_TRANS ); + blis_transa = BLIS_NO_TRANSPOSE; + } + + /* Convert/typecast negative values of m and n to zero. */ + if ( *m < 0 ) m0 = ( dim_t )0; + else m0 = ( dim_t )(*m); + + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* Determine the dimensions of x and y so we can adjust the increments, + if necessary.*/ + if ( bli_does_notrans( blis_transa ) ) + { + m_y = m0; + n_x = n0; + } + else + { + m_y = n0; + n_x = m0; + } + + /* BLAS handles cases where trans(A) has no columns, and x has no elements, + in a peculiar way. In these situations, BLAS returns without performing + any action, even though most sane interpretations of gemv would have the + the operation reduce to y := beta * y. 
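The dgemv_ wrapper above gates the hand-written zen kernels on bli_cpuid_is_avx_supported() instead of comparing architecture IDs, so a dynamically dispatched ('amdzen') build degrades gracefully to the context-derived kernels on non-AVX hardware. The general shape of such a runtime check is sketched below using GCC's __builtin_cpu_supports as a stand-in; this is only an illustration of the pattern, not BLIS's implementation, and the kernel bodies are trivial placeholders.

    /* Hedged sketch: runtime CPU-feature check guarding an optimized code path. */
    #include <stdbool.h>

    static void scale_avx_kernel( int n, double alpha, double* x )
    { for ( int i = 0; i < n; ++i ) x[i] *= alpha;  /* real code would use AVX intrinsics */ }

    static void scale_reference( int n, double alpha, double* x )
    { for ( int i = 0; i < n; ++i ) x[i] *= alpha;  /* portable fallback */ }

    void scale_dispatch( int n, double alpha, double* x )
    {
    #if defined(__GNUC__)
        bool have_avx = __builtin_cpu_supports( "avx" ) != 0;
    #else
        bool have_avx = false;                      /* fall back conservatively */
    #endif

        if ( have_avx ) scale_avx_kernel( n, alpha, x );
        else            scale_reference ( n, alpha, x );
    }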
Here, we catch those cases that + BLAS would normally mishandle and emulate the BLAS exactly so as to + provide "bug-for-bug" compatibility. Note that this extreme level of + compatibility would not be as much of an issue if it weren't for the + fact that some BLAS test suites actually test for these cases. Also, it + should be emphasized that BLIS, if called natively, does NOT exhibit + this quirky behavior; it will scale y by beta, as one would expect. */ + if ( m_y > 0 && n_x == 0 ) + { + /* Finalize BLIS. */ + // bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( *incx < 0 ) + { + x0 = ((float*)x) + (n_x-1)*(-*incx); + incx0 = ( inc_t )(*incx); + } + else + { + x0 = ((float*)x); + incx0 = ( inc_t )(*incx); + } + + if ( *incy < 0 ) + { + y0 = ((float*)y) + (m_y-1)*(-*incy); + incy0 = ( inc_t )(*incy); + } + else + { + y0 = ((float*)y); + incy0 = ( inc_t )(*incy); + } + + /* Set the row and column strides of A. */ + rs_a = 1; + cs_a = *lda; + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == FALSE) + { + /* Call BLIS interface. */ + PASTEMAC2(s,gemv,BLIS_TAPI_EX_SUF) + ( + blis_transa, + BLIS_NO_CONJUGATE, + m0, + n0, + (float*)alpha, + (float*)a, rs_a, cs_a, + x0, incx0, + (float*)beta, + y0, incy0, + NULL, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + /* Call variants based on transpose value. */ + if(bli_does_notrans(blis_transa)) + { + bli_sgemv_unf_var2 + ( + blis_transa, + BLIS_NO_CONJUGATE, + m0, + n0, + (float*)alpha, + (float*)a, rs_a, cs_a, + x0, incx0, + (float*)beta, + y0, incy0, + NULL + ); + } + else + { + bli_sgemv_unf_var1 + ( + blis_transa, + BLIS_NO_CONJUGATE, + m0, + n0, + (float*)alpha, + (float*)a, rs_a, cs_a, + x0, incx0, + (float*)beta, + y0, incy0, + NULL + ); + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); +} + + +void cgemv_ + ( + const f77_char* transa, + const f77_int* m, + const f77_int* n, + const scomplex* alpha, + const scomplex* a, const f77_int* lda, + const scomplex* x, const f77_int* incx, + const scomplex* beta, + scomplex* y, const f77_int* incy + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); + + trans_t blis_transa; + dim_t m0, n0; + dim_t m_y, n_x; + scomplex* x0; + scomplex* y0; + inc_t incx0; + inc_t incy0; + inc_t rs_a, cs_a; + + /* Perform BLAS parameter checking. */ + PASTEBLACHK(gemv) + ( + MKSTR(c), + MKSTR(gemv), + transa, + m, + n, + lda, + incx, + incy + ); + + if (*m == 0 || *n == 0) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ + if( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE; + else if( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE; + else if( * transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE; + else + { + // See comment for bli_param_map_netlib_to_blis_side() above. + // bli_check_error_code( BLIS_INVALID_TRANS ); + blis_transa = BLIS_NO_TRANSPOSE; + } + + /* Convert/typecast negative values of m and n to zero. 
*/ + if( *m < 0 ) m0 = (dim_t)0; + else m0 = (dim_t)(*m); + + if( *n < 0 ) n0 = (dim_t)0; + else n0 = (dim_t)(*n); + + /* Determine the dimensions of x and y so we can adjust the increments, + if necessary.*/ + if( bli_does_notrans( blis_transa ) ) { m_y = m0, n_x = n0; } + else { m_y = n0; n_x = m0; } + + /* BLAS handles cases where trans(A) has no columns, and x has no elements, + in a peculiar way. In these situations, BLAS returns without performing + any action, even though most sane interpretations of gemv would have the + the operation reduce to y := beta * y. Here, we catch those cases that + BLAS would normally mishandle and emulate the BLAS exactly so as to + provide "bug-for-bug" compatibility. Note that this extreme level of + compatibility would not be as much of an issue if it weren't for the + fact that some BLAS test suites actually test for these cases. Also, it + should be emphasized that BLIS, if called natively, does NOT exhibit + this quirky behavior; it will scale y by beta, as one would expect. */ + + if ( m_y > 0 && n_x == 0 ) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if( *incx < 0 ) + { + x0 = ((scomplex*)x) + (n_x-1)*(-*incx); + incx0 = ( inc_t )(*incx); + } + else + { + x0 = ((scomplex*)x); + incx0 = (inc_t)(*incx); + } + + if ( *incy < 0 ) + { + y0 = ((scomplex*)y) + (m_y-1)*(-*incy); + incy0 = ( inc_t )(*incy); + } + else + { + y0 = ((scomplex*)y); + incy0 = ( inc_t )(*incy); + } + + /* Set the row and column strides of A. */ + rs_a = 1; + cs_a = *lda; + + if( m_y == 1 ) + { + conj_t conja = bli_extract_conj(blis_transa); + scomplex rho; + if (bli_cpuid_is_avx_supported() == TRUE) + { + bli_cdotv_zen_int5 + ( + conja, + BLIS_NO_CONJUGATE, + n_x, + (scomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a, + x0, incx0, + &rho, + NULL + ); + } + else + { + /* Call BLIS interface. */ + PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF) + ( + conja, + BLIS_NO_CONJUGATE, + n_x, + (scomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a, + x0, incx0, + &rho, + NULL, + NULL + ); + } + + scomplex yval = *y0; + if(!bli_ceq0(*beta)) + { + bli_cscals( *beta, yval ); + } + else + { + bli_csetsc( 0.0, 0.0, &yval); + } + if(!bli_ceq0(*alpha)) + { + bli_caxpys( *alpha, rho, yval); + } + y0->real = yval.real; + y0->imag = yval.imag; + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + if (bli_cpuid_is_avx_supported() == FALSE) + { + /* Call BLIS interface. 
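The m_y == 1 branch above exploits the fact that when the result vector has a single element, gemv collapses to y[0] = beta*y[0] + alpha*(a_row . x); that is why the code calls a dotv kernel and then finishes with scalar scal/axpy updates. A plain-C sketch of that reduction for the real double case is shown below; it is illustrative only, and the file above performs the complex variants with bli_cscals/bli_caxpys (and bli_zscals/bli_zaxpys for zgemv_).

    /* Hedged sketch: gemv with a single output element is a scaled dot product. */
    void gemv_single_row( int n_x, double alpha, const double* a_row, int inca,
                          const double* x, int incx, double beta, double* y )
    {
        double rho = 0.0;
        for ( int i = 0; i < n_x; ++i )
            rho += a_row[ i * inca ] * x[ i * incx ];   /* rho = a_row . x      */

        *y = beta * (*y) + alpha * rho;                 /* y = beta*y + alpha*rho */
    }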
*/ + PASTEMAC2(c,gemv,BLIS_TAPI_EX_SUF) + ( + blis_transa, + BLIS_NO_CONJUGATE, + m0, + n0, + (scomplex*)alpha, + (scomplex*)a, rs_a, cs_a, + x0, incx0, + (scomplex*)beta, + y0, incy0, + NULL, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + /* call variants based on transpose value */ + if( bli_does_notrans( blis_transa ) ) + { + bli_cgemv_unf_var2 + ( + blis_transa, + BLIS_NO_CONJUGATE, + m0, + n0, + (scomplex*)alpha, + (scomplex*)a, rs_a, cs_a, + x0, incx0, + (scomplex*)beta, + y0, incy0, + NULL + ); + } + else + { + bli_cgemv_unf_var1 + ( + blis_transa, + BLIS_NO_CONJUGATE, + m0, + n0, + (scomplex*)alpha, + (scomplex*)a, rs_a, cs_a, + x0, incx0, + (scomplex*)beta, + y0, incy0, + NULL + ); + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); +} + + +void zgemv_ + ( + const f77_char* transa, + const f77_int* m, + const f77_int* n, + const dcomplex* alpha, + const dcomplex* a, const f77_int* lda, + const dcomplex* x, const f77_int* incx, + const dcomplex* beta, + dcomplex* y, const f77_int* incy + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); + AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); + + trans_t blis_transa; + dim_t m0, n0; + dim_t m_y, n_x; + dcomplex* x0; + dcomplex* y0; + inc_t incx0; + inc_t incy0; + inc_t rs_a, cs_a; + + /* Perform BLAS parameter checking. */ + PASTEBLACHK(gemv) + ( + MKSTR(z), + MKSTR(gemv), + transa, + m, + n, + lda, + incx, + incy + ); + + if (*m == 0 || *n == 0) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ + if( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE; + else if( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE; + else if( * transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE; + else + { + // See comment for bli_param_map_netlib_to_blis_side() above. + // bli_check_error_code( BLIS_INVALID_TRANS ); + blis_transa = BLIS_NO_TRANSPOSE; + } + + /* Convert/typecast negative values of m and n to zero. */ + if( *m < 0 ) m0 = (dim_t)0; + else m0 = (dim_t)(*m); + + if( *n < 0 ) n0 = (dim_t)0; + else n0 = (dim_t)(*n); + + /* Determine the dimensions of x and y so we can adjust the increments, + if necessary.*/ + if( bli_does_notrans( blis_transa ) ) { m_y = m0, n_x = n0; } + else { m_y = n0; n_x = m0; } + + /* BLAS handles cases where trans(A) has no columns, and x has no elements, + in a peculiar way. In these situations, BLAS returns without performing + any action, even though most sane interpretations of gemv would have the + the operation reduce to y := beta * y. Here, we catch those cases that + BLAS would normally mishandle and emulate the BLAS exactly so as to + provide "bug-for-bug" compatibility. Note that this extreme level of + compatibility would not be as much of an issue if it weren't for the + fact that some BLAS test suites actually test for these cases. Also, it + should be emphasized that BLIS, if called natively, does NOT exhibit + this quirky behavior; it will scale y by beta, as one would expect. */ + + if ( m_y > 0 && n_x == 0 ) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. 
*/ + if( *incx < 0 ) + { + x0 = ((dcomplex*)x) + (n_x-1)*(-*incx); + incx0 = ( inc_t )(*incx); + } + else + { + x0 = ((dcomplex*)x); + incx0 = (inc_t)(*incx); + } + + if ( *incy < 0 ) + { + y0 = ((dcomplex*)y) + (m_y-1)*(-*incy); + incy0 = ( inc_t )(*incy); + } + else + { + y0 = ((dcomplex*)y); + incy0 = ( inc_t )(*incy); + } + + /* Set the row and column strides of A. */ + rs_a = 1; + cs_a = *lda; + + if( m_y == 1 ) + { + conj_t conja = bli_extract_conj(blis_transa); + dcomplex rho; + + if (bli_cpuid_is_avx_supported() == TRUE) + { + bli_zdotv_zen_int5 + ( + conja, + BLIS_NO_CONJUGATE, + n_x, + (dcomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a, + x0, incx0, + &rho, + NULL + ); + } + else + { + /* Call BLIS interface. */ + PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF) + ( + conja, + BLIS_NO_CONJUGATE, + n_x, + (dcomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a, + x0, incx0, + &rho, + NULL, + NULL + ); + } + + dcomplex yval = *y0; + if(!bli_zeq0(*beta)) + { + bli_zscals( *beta, yval ); + } + else + { + bli_zsetsc( 0.0, 0.0, &yval); + } + if(!bli_zeq0(*alpha)) + { + bli_zaxpys( *alpha, rho, yval); + } + y0->real = yval.real; + y0->imag = yval.imag; + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + if (bli_cpuid_is_avx_supported() == FALSE) + { + /* Call BLIS interface. */ + PASTEMAC2(z,gemv,BLIS_TAPI_EX_SUF) + ( + blis_transa, + BLIS_NO_CONJUGATE, + m0, + n0, + (dcomplex*)alpha, + (dcomplex*)a, rs_a, cs_a, + x0, incx0, + (dcomplex*)beta, + y0, incy0, + NULL, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + /* call variants based on transpose value */ + if( bli_does_notrans( blis_transa ) ) + { + bli_zgemv_unf_var2 + ( + blis_transa, + BLIS_NO_CONJUGATE, + m0, + n0, + (dcomplex*)alpha, + (dcomplex*)a, rs_a, cs_a, + x0, incx0, + (dcomplex*)beta, + y0, incy0, + NULL + ); + } + else + { + bli_zgemv_unf_var1 + ( + blis_transa, + BLIS_NO_CONJUGATE, + m0, + n0, + (dcomplex*)alpha, + (dcomplex*)a, rs_a, cs_a, + x0, incx0, + (dcomplex*)beta, + y0, incy0, + NULL + ); + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); +} + + + +#endif diff --git a/frame/compat/bla_scal.c b/frame/compat/bla_scal.c index ab63a3459..b9651577e 100644 --- a/frame/compat/bla_scal.c +++ b/frame/compat/bla_scal.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020-21, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -93,179 +93,5 @@ void PASTEF772(chx,cha,blasname) \ } #ifdef BLIS_ENABLE_BLAS -#ifdef BLIS_CONFIG_EPYC - -void sscal_ - ( - const f77_int* n, - const float* alpha, - float* x, const f77_int* incx - ) -{ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) - AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', (void *) alpha, *n, *incx ); - dim_t n0; - float* x0; - inc_t incx0; - /* Initialize BLIS. */ - //bli_init_auto(); - - if (*n == 0 || alpha == NULL) { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. 
(Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - - x0 = (x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - - } - else - { - x0 = (x); - incx0 = ( inc_t )(*incx); - } - /* Call BLIS kernel */ - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) { - bli_sscalv_zen_int10 - ( - BLIS_NO_CONJUGATE, - n0, - (float *)alpha, - x0, incx0, - NULL - ); - } - else{ - PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE,\ - n0, \ - (float *)alpha,\ - x0, incx0,\ - NULL, \ - NULL \ - );\ - } - - /* Finalize BLIS. */ -// bli_finalize_auto(); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) -} - -void dscal_ - ( - const f77_int* n, - const double* alpha, - double* x, const f77_int* incx - ) -{ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) - AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', (void *)alpha, *n, *incx ); - dim_t n0; - double* x0; - inc_t incx0; - - /* Initialize BLIS */ - //bli_init_auto(); - - if (*n == 0 || alpha == NULL) { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; - } - - /* Convert typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - - x0 = (x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - - } - else - { - x0 = (x); - incx0 = ( inc_t )(*incx); - } - /* Call BLIS kernel */ - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen){ - bli_dscalv_zen_int10 - ( - BLIS_NO_CONJUGATE, - n0, - (double*) alpha, - x0, incx0, - NULL - ); - } - else{ - PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE,\ - n0, \ - (double *)alpha,\ - x0, incx0,\ - NULL, \ - NULL \ - );\ - } - - /* Finalize BLIS. 
*/ -// bli_finalize_auto(); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) -} - -INSERT_GENTFUNCSCAL_BLAS_CZ( scal, scalv ) -#else INSERT_GENTFUNCSCAL_BLAS( scal, scalv ) #endif -#endif diff --git a/frame/compat/bla_scal_amd.c b/frame/compat/bla_scal_amd.c new file mode 100644 index 000000000..178776a14 --- /dev/null +++ b/frame/compat/bla_scal_amd.c @@ -0,0 +1,260 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +// +// Define BLAS-to-BLIS interfaces. +// +#undef GENTFUNCSCAL +#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \ +\ +void PASTEF772(chx,cha,blasname) \ + ( \ + const f77_int* n, \ + const ftype_a* alpha, \ + ftype_x* x, const f77_int* incx \ + ) \ +{ \ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \ + dim_t n0; \ + ftype_x* x0; \ + inc_t incx0; \ + ftype_x alpha_cast; \ +\ + /* Initialize BLIS. */ \ + bli_init_auto(); \ +\ + if (*n == 0 || alpha == NULL) { \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ + return ; \ + } \ +\ + /* Convert/typecast negative values of n to zero. */ \ + bli_convert_blas_dim1( *n, n0 ); \ +\ + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ \ + bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \ +\ + /* NOTE: We do not natively implement BLAS's csscal/zdscal in BLIS. + that is, we just always sub-optimally implement those cases + by casting alpha to ctype_x (potentially the complex domain) and + using the homogeneous datatype instance according to that type. */ \ + PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \ +\ + /* Call BLIS interface. */ \ + PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + n0, \ + &alpha_cast, \ + x0, incx0, \ + NULL, \ + NULL \ + ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ + /* Finalize BLIS. 
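The note in the macro above explains that csscal/zdscal (real alpha, complex x) are not implemented natively; alpha is first copied into the complex domain (alpha_cast) and the homogeneous scalv path is then used. A short sketch of that promotion for the zdscal-style case follows, written with C99 complex types; it is illustrative only and assumes a positive increment.

    /* Hedged sketch: "complex vector scaled by real alpha" implemented via promotion. */
    #include <complex.h>

    void zdscal_like( int n, double alpha, double complex* x, int incx )
    {
        double complex alpha_cast = alpha + 0.0 * I;    /* promote real alpha to complex */

        for ( int i = 0; i < n; ++i )                   /* assumes incx > 0              */
            x[ i * incx ] *= alpha_cast;                /* homogeneous complex scaling   */
    }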
*/ \ + bli_finalize_auto(); \ +} + +#ifdef BLIS_ENABLE_BLAS + +void sscal_ + ( + const f77_int* n, + const float* alpha, + float* x, const f77_int* incx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', (void *) alpha, *n, *incx ); + dim_t n0; + float* x0; + inc_t incx0; + /* Initialize BLIS. */ + //bli_init_auto(); + + if (*n == 0 || alpha == NULL) { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = (x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + + } + else + { + x0 = (x); + incx0 = ( inc_t )(*incx); + } + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) { + bli_sscalv_zen_int10 + ( + BLIS_NO_CONJUGATE, + n0, + (float *)alpha, + x0, incx0, + NULL + ); + } + else{ + PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE,\ + n0, \ + (float *)alpha,\ + x0, incx0,\ + NULL, \ + NULL \ + );\ + } + + /* Finalize BLIS. */ +// bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) +} + +void dscal_ + ( + const f77_int* n, + const double* alpha, + double* x, const f77_int* incx + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', (void *)alpha, *n, *incx ); + dim_t n0; + double* x0; + inc_t incx0; + + /* Initialize BLIS */ + //bli_init_auto(); + + if (*n == 0 || alpha == NULL) { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + return; + } + + /* Convert typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. 
Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = (x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + + } + else + { + x0 = (x); + incx0 = ( inc_t )(*incx); + } + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE){ + bli_dscalv_zen_int10 + ( + BLIS_NO_CONJUGATE, + n0, + (double*) alpha, + x0, incx0, + NULL + ); + } + else{ + PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE,\ + n0, \ + (double *)alpha,\ + x0, incx0,\ + NULL, \ + NULL \ + );\ + } + + /* Finalize BLIS. */ +// bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) +} + +INSERT_GENTFUNCSCAL_BLAS_CZ( scal, scalv ) + +#endif diff --git a/frame/compat/bla_swap.c b/frame/compat/bla_swap.c index 526414f33..d65342647 100644 --- a/frame/compat/bla_swap.c +++ b/frame/compat/bla_swap.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020-21, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -83,198 +83,5 @@ void PASTEF77(ch,blasname) \ } #ifdef BLIS_ENABLE_BLAS -#ifdef BLIS_CONFIG_EPYC - -void sswap_ - ( - const f77_int* n, - float* x, const f77_int* incx, - float* y, const f77_int* incy - ) -{ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) - AOCL_DTL_LOG_SWAP_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy); - dim_t n0; - float* x0; - float* y0; - inc_t incx0; - inc_t incy0; - - /* Initialize BLIS. */ -// bli_init_auto(); - - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. 
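The negative-stride explanation above (repeated for each wrapper in this patch) is easiest to see with numbers: for n = 4 and incx = -2, the BLAS caller passes the address of the left-most element, so the wrapper computes x0 = x + (4-1)*2 and keeps the negative stride, after which x0[0], x0[-2], x0[-4], x0[-6] visit the same elements in reverse order. The tiny self-contained check below mirrors that x0/incx0 computation; the buffer contents are illustrative only.

    /* Hedged sketch: BLAS negative-stride convention vs. the adjusted BLIS view. */
    #include <stdio.h>

    int main( void )
    {
        double  x[7]  = { 10, 0, 11, 0, 12, 0, 13 };   /* logical vector: 10,11,12,13     */
        int     n     = 4;
        int     incx  = -2;                            /* BLAS caller still passes &x[0]  */

        double* x0    = x + ( n - 1 ) * ( -incx );     /* point at the right-most element */
        int     incx0 = incx;                          /* stride stays negative           */

        for ( int i = 0; i < n; ++i )
            printf( "%g ", x0[ i * incx0 ] );          /* prints: 13 12 11 10 */
        printf( "\n" );

        return 0;
    }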
*/ - - x0 = (x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - - } - else - { - x0 = (x); - incx0 = ( inc_t )(*incx); - } - - if ( *incy < 0 ) - { - y0 = (y) + (n0-1)*(-*incy); - incy0 = ( inc_t )(*incy); - - } - else - { - y0 = (y); - incy0 = ( inc_t )(*incy); - } - - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) { -/* Call BLIS kernel */ - bli_sswapv_zen_int8 - ( - n0, - x0, incx0, - y0, incy0, - NULL - ); - } - else{ - PASTEMAC2(s,swapv,BLIS_TAPI_EX_SUF) \ - ( \ - n0, \ - x0, incx0, \ - y0, incy0, \ - NULL, \ - NULL \ - ); \ - } - - /* Finalize BLIS. */ -// bli_finalize_auto(); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) -} - -void dswap_ - ( - const f77_int* n, - double* x, const f77_int* incx, - double* y, const f77_int* incy - ) -{ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) - AOCL_DTL_LOG_SWAP_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy); - dim_t n0; - double* x0; - double* y0; - inc_t incx0; - inc_t incy0; - - /* Initialize BLIS. */ -// bli_init_auto(); - - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) This - is also how BLIS interprets negative strides. The differences - is that with BLAS, the caller *always* passes in the 0th (i.e., - top-most or left-most) element of the vector, even when the - stride is negative. By contrast, in BLIS, negative strides are - used *relative* to the vector address as it is given. Thus, in - BLIS, if this backwards traversal is desired, the caller *must* - pass in the address to the (n-1)th (i.e., the bottom-most or - right-most) element along with a negative stride. */ - - x0 = (x) + (n0-1)*(-*incx); - incx0 = ( inc_t )(*incx); - - } - else - { - x0 = (x); - incx0 = ( inc_t )(*incx); - } - - if ( *incy < 0 ) - { - y0 = (y) + (n0-1)*(-*incy); - incy0 = ( inc_t )(*incy); - - } - else - { - y0 = (y); - incy0 = ( inc_t )(*incy); - } - - - /* Call BLIS kernel */ - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) { - bli_dswapv_zen_int8 - ( - n0, - x0, incx0, - y0, incy0, - NULL - ); - } - else{ - PASTEMAC2(d,swapv,BLIS_TAPI_EX_SUF) \ - ( \ - n0, \ - x0, incx0, \ - y0, incy0, \ - NULL, \ - NULL \ - ); \ - } - - /* Finalize BLIS. */ -// bli_finalize_auto(); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) -} - -INSERT_GENTFUNC_BLAS_CZ( swap, swapv ) - -#else INSERT_GENTFUNC_BLAS( swap, swapv ) #endif -#endif diff --git a/frame/compat/bla_swap_amd.c b/frame/compat/bla_swap_amd.c new file mode 100644 index 000000000..617c78a4a --- /dev/null +++ b/frame/compat/bla_swap_amd.c @@ -0,0 +1,268 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +// +// Define BLAS-to-BLIS interfaces. +// +#undef GENTFUNC +#define GENTFUNC( ftype, ch, blasname, blisname ) \ +\ +void PASTEF77(ch,blasname) \ + ( \ + const f77_int* n, \ + ftype* x, const f77_int* incx, \ + ftype* y, const f77_int* incy \ + ) \ +{ \ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \ + dim_t n0; \ + ftype* x0; \ + ftype* y0; \ + inc_t incx0; \ + inc_t incy0; \ +\ + /* Initialize BLIS. */ \ + bli_init_auto(); \ +\ + /* Convert/typecast negative values of n to zero. */ \ + bli_convert_blas_dim1( *n, n0 ); \ +\ + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ \ + bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \ + bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ +\ + /* Call BLIS interface. */ \ + PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ + ( \ + n0, \ + x0, incx0, \ + y0, incy0, \ + NULL, \ + NULL \ + ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ +} + +#ifdef BLIS_ENABLE_BLAS + +void sswap_ + ( + const f77_int* n, + float* x, const f77_int* incx, + float* y, const f77_int* incy + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + AOCL_DTL_LOG_SWAP_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy); + dim_t n0; + float* x0; + float* y0; + inc_t incx0; + inc_t incy0; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. 
The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = (x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + + } + else + { + x0 = (x); + incx0 = ( inc_t )(*incx); + } + + if ( *incy < 0 ) + { + y0 = (y) + (n0-1)*(-*incy); + incy0 = ( inc_t )(*incy); + + } + else + { + y0 = (y); + incy0 = ( inc_t )(*incy); + } + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) { + /* Call BLIS kernel */ + bli_sswapv_zen_int8 + ( + n0, + x0, incx0, + y0, incy0, + NULL + ); + } + else{ + PASTEMAC2(s,swapv,BLIS_TAPI_EX_SUF) \ + ( \ + n0, \ + x0, incx0, \ + y0, incy0, \ + NULL, \ + NULL \ + ); \ + } + + /* Finalize BLIS. */ +// bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) +} + +void dswap_ + ( + const f77_int* n, + double* x, const f77_int* incx, + double* y, const f77_int* incy + ) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) + AOCL_DTL_LOG_SWAP_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy); + dim_t n0; + double* x0; + double* y0; + inc_t incx0; + inc_t incy0; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = (x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + + } + else + { + x0 = (x); + incx0 = ( inc_t )(*incx); + } + + if ( *incy < 0 ) + { + y0 = (y) + (n0-1)*(-*incy); + incy0 = ( inc_t )(*incy); + + } + else + { + y0 = (y); + incy0 = ( inc_t )(*incy); + } + + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) { + bli_dswapv_zen_int8 + ( + n0, + x0, incx0, + y0, incy0, + NULL + ); + } + else{ + PASTEMAC2(d,swapv,BLIS_TAPI_EX_SUF) \ + ( \ + n0, \ + x0, incx0, \ + y0, incy0, \ + NULL, \ + NULL \ + ); \ + } + + /* Finalize BLIS. 
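/*
   Illustration only (not part of the patch): the wrappers in this file pick
   the hand-written zen kernel or the generic, context-derived path via a
   runtime feature query instead of the removed BLIS_CONFIG_EPYC compile-time
   macro.  A minimal sketch of such a cached runtime check, assuming a
   GCC/Clang toolchain on x86; my_cpu_has_avx and my_swapv_dispatch are
   hypothetical names and are not the bli_cpuid_is_avx_supported()
   implementation.
*/
#include <stdbool.h>

static bool my_cpu_has_avx( void )
{
    static int cached = -1;                      /* -1 means "not probed yet" */
    if ( cached < 0 )
        cached = __builtin_cpu_supports( "avx" ) ? 1 : 0;
    return cached == 1;
}

void my_swapv_dispatch( int n, float* x, int incx, float* y, int incy )
{
    if ( my_cpu_has_avx() )
    {
        /* call the AVX-optimized kernel here, e.g. bli_sswapv_zen_int8 */
    }
    else
    {
        /* fall back to the generic, context-derived swapv path here */
    }
    ( void )n; ( void )x; ( void )incx; ( void )y; ( void )incy;
}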
*/ +// bli_finalize_auto(); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) +} + +INSERT_GENTFUNC_BLAS_CZ( swap, swapv ) + + +#endif diff --git a/frame/compat/bla_trsm.c b/frame/compat/bla_trsm.c index fa8f0dacd..fea7ba6f1 100644 --- a/frame/compat/bla_trsm.c +++ b/frame/compat/bla_trsm.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -380,1175 +380,5 @@ void PASTEF77(ch,blasname) \ #endif #ifdef BLIS_ENABLE_BLAS -#ifdef BLIS_CONFIG_EPYC - -void strsm_ -( - const f77_char* side, - const f77_char* uploa, - const f77_char* transa, - const f77_char* diaga, - const f77_int* m, - const f77_int* n, - const float* alpha, - const float* a, const f77_int* lda, - float* b, const f77_int* ldb -) -{ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) - AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'd', - *side, *uploa,*transa, *diaga, *m, *n, - (void*)alpha,*lda, *ldb); - - side_t blis_side; - uplo_t blis_uploa; - trans_t blis_transa; - diag_t blis_diaga; - dim_t m0, n0; - conj_t conja = BLIS_NO_CONJUGATE ; - - /* Initialize BLIS. */ - bli_init_auto(); - - /* Perform BLAS parameter checking. */ - PASTEBLACHK(trsm) - ( - MKSTR(s), - MKSTR(trsm), - side, - uploa, - transa, - diaga, - m, - n, - lda, - ldb - ); - - /* Map BLAS chars to their corresponding BLIS enumerated type value. */ - bli_param_map_netlib_to_blis_side( *side, &blis_side ); - bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); - bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); - bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); - - /* Typecast BLAS integers to BLIS integers. */ - bli_convert_blas_dim1( *m, m0 ); - bli_convert_blas_dim1( *n, n0 ); - - /* Set the row and column strides of the matrix operands. 
*/ - const inc_t rs_a = 1; - const inc_t cs_a = *lda; - const inc_t rs_b = 1; - const inc_t cs_b = *ldb; - const num_t dt = BLIS_FLOAT; - - if( n0 == 1 ) - { - if( blis_side == BLIS_LEFT ) - { - if(bli_is_notrans(blis_transa)) - { - bli_strsv_unf_var2 - ( - blis_uploa, - blis_transa, - blis_diaga, - m0, - (float*)alpha, - (float*)a, rs_a, cs_a, - (float*)b, rs_b, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - else if(bli_is_trans(blis_transa)) - { - bli_strsv_unf_var1 - ( - blis_uploa, - blis_transa, - blis_diaga, - m0, - (float*)alpha, - (float*)a, rs_a, cs_a, - (float*)b, rs_b, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - } - else if( ( blis_side == BLIS_RIGHT ) && ( m0 != 1 ) ) - { - /* b = alpha * b; */ - bli_sscalv_ex - ( - conja, - m0, - (float*)alpha, - b, rs_b, - NULL, - NULL - ); - if(blis_diaga == BLIS_NONUNIT_DIAG) - { - float inva = 1.0/ *a; - for(dim_t indx = 0; indx < m0; indx ++) - { - b[indx] = ( inva * b[indx] ); - } - } - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - } - else if( m0 == 1 ) - { - if(blis_side == BLIS_RIGHT) - { - if(bli_is_notrans(blis_transa)) - { - if(blis_uploa == BLIS_UPPER) - blis_uploa = BLIS_LOWER; - else - blis_uploa = BLIS_UPPER; - - bli_strsv_unf_var1 - ( - blis_uploa, - blis_transa, - blis_diaga, - n0, - (float*)alpha, - (float*)a, cs_a, rs_a, - (float*)b, cs_b, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - else if(bli_is_trans(blis_transa)) - { - if(blis_uploa == BLIS_UPPER) - blis_uploa = BLIS_LOWER; - else - blis_uploa = BLIS_UPPER; - - bli_strsv_unf_var2 - ( - blis_uploa, - blis_transa, - blis_diaga, - n0, - (float*)alpha, - (float*)a, cs_a, rs_a, - (float*)b, cs_b, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - } - else if(( blis_side == BLIS_LEFT ) && ( n0 != 1 )) - { - /* b = alpha * b; */ - bli_sscalv_ex - ( - conja, - n0, - (float*)alpha, - b, cs_b, - NULL, - NULL - ); - if(blis_diaga == BLIS_NONUNIT_DIAG) - { - float inva = 1.0/ *a; - for(dim_t indx = 0; indx < n0; indx ++) - { - b[indx*cs_b] = (inva * b[indx*cs_b] ); - } - } - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - } - const struc_t struca = BLIS_TRIANGULAR; - - obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; - obj_t ao = BLIS_OBJECT_INITIALIZER; - obj_t bo = BLIS_OBJECT_INITIALIZER; - - dim_t mn0_a; - - bli_set_dim_with_side( blis_side, m0, n0, &mn0_a ); - - bli_obj_init_finish_1x1( dt, (float*)alpha, &alphao ); - - bli_obj_init_finish( dt, mn0_a, mn0_a, (float*)a, rs_a, cs_a, &ao ); - bli_obj_init_finish( dt, m0, n0, (float*)b, rs_b, cs_b, &bo ); - - bli_obj_set_uplo( blis_uploa, &ao ); - bli_obj_set_diag( blis_diaga, &ao ); - bli_obj_set_conjtrans( blis_transa, &ao ); - - bli_obj_set_struc( struca, &ao ); - - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) { -#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM - /* bli_strsm_small is performing better existing native - * implementations for [m,n]<=1000 for single thread. - * In case of multithread when [m,n]<=128 sinlge thread implemenation - * is doing better than native multithread */ - bool nt = bli_thread_get_is_parallel(); - if((nt==0 && m0<=1000 && n0<=1000) || - (nt && (m0+n0)<320) ) - { - err_t status; - status = bli_trsm_small - ( - blis_side, - &alphao, - &ao, - &bo, - NULL, - NULL - ); - if (status == BLIS_SUCCESS) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - /* Finalize BLIS. 
*/ - bli_finalize_auto(); - return; - } - } -#endif - } - bli_trsmnat - ( - blis_side, - &alphao, - &ao, - &bo, - NULL, - NULL - ); - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) - /* Finalize BLIS. */ - bli_finalize_auto(); -} - -void dtrsm_ -( - const f77_char* side, - const f77_char* uploa, - const f77_char* transa, - const f77_char* diaga, - const f77_int* m, - const f77_int* n, - const double* alpha, - const double* a, const f77_int* lda, - double* b, const f77_int* ldb -) -{ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) - AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'd', - *side, *uploa,*transa, *diaga, *m, *n, - (void*)alpha,*lda, *ldb); - - side_t blis_side; - uplo_t blis_uploa; - trans_t blis_transa; - diag_t blis_diaga; - dim_t m0, n0; - conj_t conja = BLIS_NO_CONJUGATE ; - - /* Initialize BLIS. */ - bli_init_auto(); - - /* Perform BLAS parameter checking. */ - PASTEBLACHK(trsm) - ( - MKSTR(d), - MKSTR(trsm), - side, - uploa, - transa, - diaga, - m, - n, - lda, - ldb - ); - - /* Map BLAS chars to their corresponding BLIS enumerated type value. */ - bli_param_map_netlib_to_blis_side( *side, &blis_side ); - bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); - bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); - bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); - - /* Typecast BLAS integers to BLIS integers. */ - bli_convert_blas_dim1( *m, m0 ); - bli_convert_blas_dim1( *n, n0 ); - - /* Set the row and column strides of the matrix operands. */ - const inc_t rs_a = 1; - const inc_t cs_a = *lda; - const inc_t rs_b = 1; - const inc_t cs_b = *ldb; - const num_t dt = BLIS_DOUBLE; - - if( n0 == 1 ) - { - if( blis_side == BLIS_LEFT ) - { - if(bli_is_notrans(blis_transa)) - { - bli_dtrsv_unf_var2 - ( - blis_uploa, - blis_transa, - blis_diaga, - m0, - (double*)alpha, - (double*)a, rs_a, cs_a, - (double*)b, rs_b, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - else if(bli_is_trans(blis_transa)) - { - bli_dtrsv_unf_var1 - ( - blis_uploa, - blis_transa, - blis_diaga, - m0, - (double*)alpha, - (double*)a, rs_a, cs_a, - (double*)b, rs_b, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - } - else if( ( blis_side == BLIS_RIGHT ) && ( m0 != 1 ) ) - { - /* b = alpha * b; */ - bli_dscalv_ex - ( - conja, - m0, - (double*)alpha, - b, rs_b, - NULL, - NULL - ); - if(blis_diaga == BLIS_NONUNIT_DIAG) - { - double inva = 1.0/ *a; - for(dim_t indx = 0; indx < m0; indx ++) - { - b[indx] = ( inva * b[indx] ); - } - } - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - } - else if( m0 == 1 ) - { - if(blis_side == BLIS_RIGHT) - { - if(bli_is_notrans(blis_transa)) - { - if(blis_uploa == BLIS_UPPER) - blis_uploa = BLIS_LOWER; - else - blis_uploa = BLIS_UPPER; - - bli_dtrsv_unf_var1 - ( - blis_uploa, - blis_transa, - blis_diaga, - n0, - (double*)alpha, - (double*)a, cs_a, rs_a, - (double*)b, cs_b, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - else if(bli_is_trans(blis_transa)) - { - if(blis_uploa == BLIS_UPPER) - blis_uploa = BLIS_LOWER; - else - blis_uploa = BLIS_UPPER; - - bli_dtrsv_unf_var2 - ( - blis_uploa, - blis_transa, - blis_diaga, - n0, - (double*)alpha, - (double*)a, cs_a, rs_a, - (double*)b, cs_b, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - } - else if(( blis_side == BLIS_LEFT ) && ( n0 != 1 )) - { - /* b = alpha * b; */ - bli_dscalv_ex - ( - conja, - n0, - (double*)alpha, - b, cs_b, - NULL, - NULL - ); - if(blis_diaga == BLIS_NONUNIT_DIAG) - { - double inva = 1.0/ *a; - 
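/*
   Illustration only (not part of the patch): the branch above covers the
   case where the triangular matrix is effectively 1x1 (m0 == 1 with
   side = left, or n0 == 1 with side = right).  The solve A*X = alpha*B then
   collapses to the scalar update x_i = alpha*b_i / a, which is what the
   scalv call followed by the inva loop computes.  A minimal sketch of that
   degenerate case for a real, non-unit 1x1 A; trsm_1x1_nonunit_sketch is a
   hypothetical name.
*/
static void trsm_1x1_nonunit_sketch( int len, double alpha, double a,
                                     double* b, long incb )
{
    /* b := (alpha / a) * b, element by element. */
    const double inva = 1.0 / a;
    for ( int i = 0; i < len; i++ )
        b[ i * incb ] = inva * ( alpha * b[ i * incb ] );
}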
for(dim_t indx = 0; indx < n0; indx ++) - { - b[indx*cs_b] = (inva * b[indx*cs_b] ); - } - } - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - } - - const struc_t struca = BLIS_TRIANGULAR; - - obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; - obj_t ao = BLIS_OBJECT_INITIALIZER; - obj_t bo = BLIS_OBJECT_INITIALIZER; - - dim_t mn0_a; - - bli_set_dim_with_side( blis_side, m0, n0, &mn0_a ); - - bli_obj_init_finish_1x1( dt, (double*)alpha, &alphao ); - - bli_obj_init_finish( dt, mn0_a, mn0_a, (double*)a, rs_a, cs_a, &ao ); - bli_obj_init_finish( dt, m0, n0, (double*)b, rs_b, cs_b, &bo ); - - bli_obj_set_uplo( blis_uploa, &ao ); - bli_obj_set_diag( blis_diaga, &ao ); - bli_obj_set_conjtrans( blis_transa, &ao ); - - bli_obj_set_struc( struca, &ao ); - - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (bamdzen) { -#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM - /* bli_dtrsm_small is performing better existing native - * implementations for [m,n]<=1000 for single thread. - * In case of multithread when [m,n]<=128 sinlge thread implemenation - * is doing better than native multithread */ - bool nt = bli_thread_get_is_parallel(); - if((nt==0 && m0<=1000 && n0<=1000) || - (nt && (m0+n0)<320) ) - { - err_t status; - status = bli_trsm_small - ( - blis_side, - &alphao, - &ao, - &bo, - NULL, - NULL - ); - if (status == BLIS_SUCCESS) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - /* Finalize BLIS. */ - bli_finalize_auto(); - return; - } - } -#endif - } - bli_trsmnat - ( - blis_side, - &alphao, - &ao, - &bo, - NULL, - NULL - ); - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) - /* Finalize BLIS. */ - bli_finalize_auto(); -} -#if 0 -void ztrsm_ -( - const f77_char* side, - const f77_char* uploa, - const f77_char* transa, - const f77_char* diaga, - const f77_int* m, - const f77_int* n, - const dcomplex* alpha, - const dcomplex* a, const f77_int* lda, - dcomplex* b, const f77_int* ldb -) -{ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) - AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'z', - *side, *uploa,*transa, *diaga, *m, *n, - (void*)alpha,*lda, *ldb); - - side_t blis_side; - uplo_t blis_uploa; - trans_t blis_transa; - diag_t blis_diaga; - dim_t m0, n0; - conj_t conja = BLIS_NO_CONJUGATE; - - /* Initialize BLIS. */ - bli_init_auto(); - - /* Perform BLAS parameter checking. */ - PASTEBLACHK(trsm) - ( - MKSTR(z), - MKSTR(trsm), - side, - uploa, - transa, - diaga, - m, - n, - lda, - ldb - ); - - /* Map BLAS chars to their corresponding BLIS enumerated type value. */ - bli_param_map_netlib_to_blis_side( *side, &blis_side ); - bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); - bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); - bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); - - /* Typecast BLAS integers to BLIS integers. */ - bli_convert_blas_dim1( *m, m0 ); - bli_convert_blas_dim1( *n, n0 ); - - /* Set the row and column strides of the matrix operands. 
*/ - const inc_t rs_a = 1; - const inc_t cs_a = *lda; - const inc_t rs_b = 1; - const inc_t cs_b = *ldb; - const num_t dt = BLIS_DCOMPLEX; - - - if( n0 == 1 ) - { - if( blis_side == BLIS_LEFT ) - { - if(bli_is_notrans(blis_transa)) - { - bli_ztrsv_unf_var2 - ( - blis_uploa, - blis_transa, - blis_diaga, - m0, - (dcomplex*)alpha, - (dcomplex*)a, rs_a, cs_a, - (dcomplex*)b, rs_b, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - else if(bli_is_trans(blis_transa)) - { - bli_ztrsv_unf_var1 - ( - blis_uploa, - blis_transa, - blis_diaga, - m0, - (dcomplex*)alpha, - (dcomplex*)a, rs_a, cs_a, - (dcomplex*)b, rs_b, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - } - else if( ( blis_side == BLIS_RIGHT ) && ( m0 != 1 ) ) - { - bli_zscalv_ex - ( - conja, - m0, - (dcomplex*)alpha, - (dcomplex*)b, rs_b, - NULL, - NULL - ); - if(blis_diaga == BLIS_NONUNIT_DIAG) - { - dcomplex inva = {1.0, 0.0}; - dcomplex a_dup; - /** - * For conjugate transpose and non-unit diagonal - * kernel, negating imaginary part of A. - * As the dimension of A is 1x1, there's going to - * be only one 1 element of A. - */ - if(*transa == 'C' && *diaga == 'N') - { - a_dup.real = a->real; - a_dup.imag = a->imag * -1.0; - } - else - { - a_dup.real = a->real; - a_dup.imag = a->imag; - } - -#ifdef BLIS_ENABLE_TRSM_PREINVERSION - bli_zinvscals(a_dup, inva); -#else - inva.real = a_dup.real; - inva.imag = a_dup.imag; -#endif - for(dim_t indx = 0; indx < m0; indx ++) - { -#ifdef BLIS_ENABLE_TRSM_PREINVERSION - bli_zscals(inva, b[indx]) -#else - - bli_zinvscals(inva, b[indx]) -#endif - } - - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - } - else if( m0 == 1 ) - { - if(blis_side == BLIS_RIGHT) - { - if(bli_is_notrans(blis_transa)) - { - if(blis_uploa == BLIS_UPPER) - blis_uploa = BLIS_LOWER; - else - blis_uploa = BLIS_UPPER; - - bli_ztrsv_unf_var1 - ( - blis_uploa, - blis_transa, - blis_diaga, - n0, - (dcomplex*)alpha, - (dcomplex*)a, cs_a, rs_a, - (dcomplex*)b, cs_b, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - else if(bli_is_trans(blis_transa)) - { - if(blis_uploa == BLIS_UPPER) - blis_uploa = BLIS_LOWER; - else - blis_uploa = BLIS_UPPER; - - bli_ztrsv_unf_var2 - ( - blis_uploa, - blis_transa, - blis_diaga, - n0, - (dcomplex*)alpha, - (dcomplex*)a, cs_a, rs_a, - (dcomplex*)b, cs_b, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - } - else if(( blis_side == BLIS_LEFT ) && ( n0 != 1 )) - { - bli_zscalv_ex - ( - conja, - n0, - (dcomplex*)alpha, - (dcomplex*)b, cs_b, - NULL, - NULL - ); - if(blis_diaga == BLIS_NONUNIT_DIAG) - { - dcomplex inva = {1.0, 0.0}; - dcomplex a_dup; - /** - * For conjugate transpose and non-unit diagonal - * kernel, negating imaginary part of A. - * As the dimension of A is 1x1, there's going to - * be only one 1 element of A. 
- */ - if(*transa == 'C' && *diaga == 'N') - { - a_dup.real = a->real; - a_dup.imag = a->imag * -1.0; - } - else - { - a_dup.real = a->real; - a_dup.imag = a->imag; - } - -#ifdef BLIS_ENABLE_TRSM_PREINVERSION - bli_zinvscals(a_dup, inva); -#else - inva.real = a_dup.real; - inva.imag = a_dup.imag; -#endif - for(dim_t indx = 0; indx < n0; indx ++) - { -#ifdef BLIS_ENABLE_TRSM_PREINVERSION - bli_zscals(inva ,b[indx * cs_b]) -#else - - bli_zinvscals(inva ,b[indx * cs_b]) -#endif - } - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - - } - } - - const struc_t struca = BLIS_TRIANGULAR; - - obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; - obj_t ao = BLIS_OBJECT_INITIALIZER; - obj_t bo = BLIS_OBJECT_INITIALIZER; - - dim_t mn0_a; - - bli_set_dim_with_side( blis_side, m0, n0, &mn0_a ); - - bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao ); - - bli_obj_init_finish( dt, mn0_a, mn0_a, (dcomplex*)a, rs_a, cs_a, &ao ); - bli_obj_init_finish( dt, m0, n0, (dcomplex*)b, rs_b, cs_b, &bo ); - - bli_obj_set_uplo( blis_uploa, &ao ); - bli_obj_set_diag( blis_diaga, &ao ); - bli_obj_set_conjtrans( blis_transa, &ao ); - - bli_obj_set_struc( struca, &ao ); - -#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM - /* bli_ztrsm_small is performing better existing native - * implementations for [m,n]<=1000 for single thread. - * In case of multithread when [m,n]<=128 sinlge thread implemenation - * is doing better than native multithread */ - bool nt = bli_thread_get_is_parallel(); - if((nt==0 && m0<=500 && n0<=500) || - (nt && (m0+n0)<128) ) - { - err_t status; - status = bli_trsm_small - ( - blis_side, - &alphao, - &ao, - &bo, - NULL, - NULL - ); - if (status == BLIS_SUCCESS) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - /* Finalize BLIS. */ - bli_finalize_auto(); - return; - } - } -#endif - - bli_trsmnat - ( - blis_side, - &alphao, - &ao, - &bo, - NULL, - NULL - ); - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) - /* Finalize BLIS. */ - bli_finalize_auto(); -} -#endif -#if 0 -void ctrsm_ -( - const f77_char* side, - const f77_char* uploa, - const f77_char* transa, - const f77_char* diaga, - const f77_int* m, - const f77_int* n, - const scomplex* alpha, - const scomplex* a, const f77_int* lda, - scomplex* b, const f77_int* ldb -) -{ - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) - AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 's', - *side, *uploa,*transa, *diaga, *m, *n, - (void*)alpha,*lda, *ldb); - - side_t blis_side; - uplo_t blis_uploa; - trans_t blis_transa; - diag_t blis_diaga; - dim_t m0, n0; - conj_t conja = BLIS_NO_CONJUGATE; - - /* Initialize BLIS. */ - bli_init_auto(); - - /* Perform BLAS parameter checking. */ - PASTEBLACHK(trsm) - ( - MKSTR(c), - MKSTR(trsm), - side, - uploa, - transa, - diaga, - m, - n, - lda, - ldb - ); - - /* Map BLAS chars to their corresponding BLIS enumerated type value. */ - bli_param_map_netlib_to_blis_side( *side, &blis_side ); - bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); - bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); - bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); - - /* Typecast BLAS integers to BLIS integers. */ - bli_convert_blas_dim1( *m, m0 ); - bli_convert_blas_dim1( *n, n0 ); - - /* Set the row and column strides of the matrix operands. 
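/*
   Illustration only (not part of the patch): in the 1x1 complex branches
   above, a conjugate transpose with a non-unit diagonal means the single
   entry of A must be conjugated before it divides B.  The arithmetic behind
   it is the usual complex reciprocal, 1/(ar + i*ai) = (ar - i*ai)/(ar^2 + ai^2),
   and BLIS_ENABLE_TRSM_PREINVERSION merely selects between multiplying by
   that reciprocal (zscals/cscals) and dividing directly (zinvscals/cinvscals).
   A minimal sketch with a hypothetical cplx type, not the BLIS dcomplex
   macros.
*/
typedef struct { double real, imag; } cplx;

static cplx cplx_conj( cplx a )
{
    cplx r = { a.real, -a.imag };
    return r;
}

static cplx cplx_recip( cplx a )
{
    double d = a.real * a.real + a.imag * a.imag;
    cplx   r = { a.real / d, -a.imag / d };
    return r;
}

/* b := b / a (optionally conjugating a first), computed as b * (1/a). */
static cplx cplx_div_by( cplx b, cplx a, int conj_a )
{
    if ( conj_a ) a = cplx_conj( a );
    cplx ia = cplx_recip( a );
    cplx r  = { b.real * ia.real - b.imag * ia.imag,
                b.real * ia.imag + b.imag * ia.real };
    return r;
}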
*/ - const inc_t rs_a = 1; - const inc_t cs_a = *lda; - const inc_t rs_b = 1; - const inc_t cs_b = *ldb; - const num_t dt = BLIS_SCOMPLEX; - - - if( n0 == 1 ) - { - if( blis_side == BLIS_LEFT ) - { - if(bli_is_notrans(blis_transa)) - { - bli_ctrsv_unf_var2 - ( - blis_uploa, - blis_transa, - blis_diaga, - m0, - (scomplex*)alpha, - (scomplex*)a, rs_a, cs_a, - (scomplex*)b, rs_b, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - else if(bli_is_trans(blis_transa)) - { - bli_ctrsv_unf_var1 - ( - blis_uploa, - blis_transa, - blis_diaga, - m0, - (scomplex*)alpha, - (scomplex*)a, rs_a, cs_a, - (scomplex*)b, rs_b, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - } - else if( ( blis_side == BLIS_RIGHT ) && ( m0 != 1 ) ) - { - bli_cscalv_ex - ( - conja, - m0, - (scomplex*)alpha, - (scomplex*)b, rs_b, - NULL, - NULL - ); - if(blis_diaga == BLIS_NONUNIT_DIAG) - { - scomplex inva = {1.0, 0.0}; - scomplex a_dup; - /** - * For conjugate transpose and non-unit diagonal - * kernel, negating imaginary part of A. - * As the dimension of A is 1x1, there's going to - * be only one 1 element of A. - */ - if(*transa == 'C' && *diaga == 'N') - { - a_dup.real = a->real; - a_dup.imag = a->imag * -1.0; - } - else - { - a_dup.real = a->real; - a_dup.imag = a->imag; - } - -#ifdef BLIS_ENABLE_TRSM_PREINVERSION - bli_cinvscals(a_dup, inva); -#else - inva.real = a_dup.real; - inva.imag = a_dup.imag; -#endif - - for(dim_t indx = 0; indx < m0; indx ++) - { -#ifdef BLIS_ENABLE_TRSM_PREINVERSION - bli_cscals(inva ,b[indx]) -#else - bli_cinvscals(inva, b[indx]) -#endif - } - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - - } - } - else if( m0 == 1 ) - { - if(blis_side == BLIS_RIGHT) - { - if(bli_is_notrans(blis_transa)) - { - if(blis_uploa == BLIS_UPPER) - blis_uploa = BLIS_LOWER; - else - blis_uploa = BLIS_UPPER; - - bli_ctrsv_unf_var1 - ( - blis_uploa, - blis_transa, - blis_diaga, - n0, - (scomplex*)alpha, - (scomplex*)a, cs_a, rs_a, - (scomplex*)b, cs_b, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - else if(bli_is_trans(blis_transa)) - { - if(blis_uploa == BLIS_UPPER) - blis_uploa = BLIS_LOWER; - else - blis_uploa = BLIS_UPPER; - - bli_ctrsv_unf_var2 - ( - blis_uploa, - blis_transa, - blis_diaga, - n0, - (scomplex*)alpha, - (scomplex*)a, cs_a, rs_a, - (scomplex*)b, cs_b, - NULL - ); - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - } - else if(( blis_side == BLIS_LEFT ) && ( n0 != 1 )) - { - bli_cscalv_ex - ( - conja, - n0, - (scomplex*)alpha, - (scomplex*)b, cs_b, - NULL, - NULL - ); - if(blis_diaga == BLIS_NONUNIT_DIAG) - { - scomplex inva = {1.0, 0.0}; - scomplex a_dup; - /** - * For conjugate transpose and non-unit diagonal - * kernel, negating imaginary part of A. - * As the dimension of A is 1x1, there's going to - * be only one 1 element of A. 
- */ - if(*transa == 'C' && *diaga == 'N') - { - a_dup.real = a->real; - a_dup.imag = a->imag * -1.0; - } - else - { - a_dup.real = a->real; - a_dup.imag = a->imag; - } - -#ifdef BLIS_ENABLE_TRSM_PREINVERSION - bli_cinvscals(a_dup, inva) -#else - inva.real = a_dup.real; - inva.imag = a_dup.imag; -#endif - for(dim_t indx = 0; indx < n0; indx ++) - { -#ifdef BLIS_ENABLE_TRSM_PREINVERSION - bli_cscals(inva ,b[indx * cs_b]) -#else - bli_cinvscals(inva, b[indx * cs_b]) -#endif - - } - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - return; - } - } - - const struc_t struca = BLIS_TRIANGULAR; - - obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; - obj_t ao = BLIS_OBJECT_INITIALIZER; - obj_t bo = BLIS_OBJECT_INITIALIZER; - - dim_t mn0_a; - - bli_set_dim_with_side( blis_side, m0, n0, &mn0_a ); - - bli_obj_init_finish_1x1( dt, (scomplex*)alpha, &alphao ); - - bli_obj_init_finish( dt, mn0_a, mn0_a, (scomplex*)a, rs_a, cs_a, &ao ); - bli_obj_init_finish( dt, m0, n0, (scomplex*)b, rs_b, cs_b, &bo ); - - bli_obj_set_uplo( blis_uploa, &ao ); - bli_obj_set_diag( blis_diaga, &ao ); - bli_obj_set_conjtrans( blis_transa, &ao ); - - bli_obj_set_struc( struca, &ao ); -#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM - /* bli_ztrsm_small is performing better existing native - * implementations for [m,n]<=1000 for single thread. - * In case of multithread when [m,n]<=128 sinlge thread implemenation - * is doing better than native multithread */ - bool nt = bli_thread_get_is_parallel(); - if((nt==0 && m0<=1000 && n0<=1000) || - (nt && (m0+n0)<320) ) - { - err_t status; - status = bli_trsm_small - ( - blis_side, - &alphao, - &ao, - &bo, - NULL, - NULL - ); - if (status == BLIS_SUCCESS) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); - /* Finalize BLIS. */ - bli_finalize_auto(); - return; - } - } -#endif - bli_trsmnat - ( - blis_side, - &alphao, - &ao, - &bo, - NULL, - NULL - ); - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) - /* Finalize BLIS. */ - bli_finalize_auto(); -} -#endif -INSERT_GENTFUNC_BLAS_CZ( trsm, trsm ) -#else INSERT_GENTFUNC_BLAS( trsm, trsm ) #endif -#endif diff --git a/frame/compat/bla_trsm_amd.c b/frame/compat/bla_trsm_amd.c new file mode 100644 index 000000000..21b2a1598 --- /dev/null +++ b/frame/compat/bla_trsm_amd.c @@ -0,0 +1,1544 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +// +// Define BLAS-to-BLIS interfaces. +// + +#ifdef BLIS_BLAS3_CALLS_TAPI + +#undef GENTFUNC +#define GENTFUNC( ftype, ch, blasname, blisname ) \ +\ +void PASTEF77(ch,blasname) \ + ( \ + const f77_char* side, \ + const f77_char* uploa, \ + const f77_char* transa, \ + const f77_char* diaga, \ + const f77_int* m, \ + const f77_int* n, \ + const ftype* alpha, \ + const ftype* a, const f77_int* lda, \ + ftype* b, const f77_int* ldb \ + ) \ +{ \ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) \ +\ + side_t blis_side; \ + uplo_t blis_uploa; \ + trans_t blis_transa; \ + diag_t blis_diaga; \ + dim_t m0, n0; \ + inc_t rs_a, cs_a; \ + inc_t rs_b, cs_b; \ +\ + /* Initialize BLIS. */ \ + bli_init_auto(); \ +\ + /* Perform BLAS parameter checking. */ \ + PASTEBLACHK(blasname) \ + ( \ + MKSTR(ch), \ + MKSTR(blasname), \ + side, \ + uploa, \ + transa, \ + diaga, \ + m, \ + n, \ + lda, \ + ldb \ + ); \ +\ + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ + bli_param_map_netlib_to_blis_side( *side, &blis_side ); \ + bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ + bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); \ +\ + /* Typecast BLAS integers to BLIS integers. */ \ + bli_convert_blas_dim1( *m, m0 ); \ + bli_convert_blas_dim1( *n, n0 ); \ +\ + /* Set the row and column strides of the matrix operands. */ \ + rs_a = 1; \ + cs_a = *lda; \ + rs_b = 1; \ + cs_b = *ldb; \ +\ + /* Call BLIS interface. */ \ + PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ + ( \ + blis_side, \ + blis_uploa, \ + blis_transa, \ + blis_diaga, \ + m0, \ + n0, \ + (ftype*)alpha, \ + (ftype*)a, rs_a, cs_a, \ + (ftype*)b, rs_b, cs_b, \ + NULL, \ + NULL \ + ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ +} + +#else + +#undef GENTFUNC +#define GENTFUNC( ftype, ch, blasname, blisname ) \ +\ +void PASTEF77(ch,blasname) \ + ( \ + const f77_char* side, \ + const f77_char* uploa, \ + const f77_char* transa, \ + const f77_char* diaga, \ + const f77_int* m, \ + const f77_int* n, \ + const ftype* alpha, \ + const ftype* a, const f77_int* lda, \ + ftype* b, const f77_int* ldb \ + ) \ +{ \ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) \ + AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *side, *uploa, \ + *transa, *diaga, *m, *n, (void*)alpha, *lda, *ldb); \ + side_t blis_side; \ + uplo_t blis_uploa; \ + trans_t blis_transa; \ + diag_t blis_diaga; \ + dim_t m0, n0; \ + ftype a_conj; \ + conj_t conja = BLIS_NO_CONJUGATE ; \ +\ + /* Initialize BLIS. */ \ + bli_init_auto(); \ +\ + /* Perform BLAS parameter checking. */ \ + PASTEBLACHK(blasname) \ + ( \ + MKSTR(ch), \ + MKSTR(blasname), \ + side, \ + uploa, \ + transa, \ + diaga, \ + m, \ + n, \ + lda, \ + ldb \ + ); \ +\ + /* Map BLAS chars to their corresponding BLIS enumerated type value. 
*/ \ + bli_param_map_netlib_to_blis_side( *side, &blis_side ); \ + bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ + bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); \ +\ + /* Typecast BLAS integers to BLIS integers. */ \ + bli_convert_blas_dim1( *m, m0 ); \ + bli_convert_blas_dim1( *n, n0 ); \ +\ + /* Set the row and column strides of the matrix operands. */ \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ + const inc_t rs_b = 1; \ + const inc_t cs_b = *ldb; \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* ----------------------------------------------------------- */ \ + /* TRSM API: AX = B, where X = B */ \ + /* CALL TRSV when X & B are vector and when A is Matrix */ \ + /* Case 1: LEFT : TRSM, B(mxn) = A(mxm) * X(mxn) */ \ + /* Case 2: RIGHT : TRSM, B(mxn) = X(mxn) * A(nxn) */ \ + /* |--------|-------|-------|-------|------------------------| */ \ + /* | | A | X | B | Implementation | */ \ + /* |--------|-------|-------|-------|------------------------| */ \ + /* | LEFT | mxm | mxn | mxn | | */ \ + /* |--------|-------|-------|-------|------------------------| */ \ + /* | n = 1 | mxm | mx1 | mx1 | TRSV | */ \ + /* | m = 1 | 1x1 | 1xn | 1xn | INVSCALS | */ \ + /* |--------|-------|-------|-------|------------------------| */ \ + /* |--------|-------|-------|-------|------------------------| */ \ + /* | | X | A | B | Implementation | */ \ + /* |--------|-------|-------|-------|------------------------| */ \ + /* | RIGHT | mxn | nxn | mxn | | */ \ + /* |--------|-------|-------|-------|------------------------| */ \ + /* | n = 1 | mx1 | 1x1 | mx1 | Transpose and INVSCALS| */ \ + /* | m = 1 | 1xn | nxn | 1xn | Transpose and TRSV | */ \ + /* |--------|-------|-------|-------|------------------------| */ \ + /* If Transpose(A) uplo = lower then uplo = higher */ \ + /* If Transpose(A) uplo = higher then uplo = lower */ \ + /* ----------------------------------------------------------- */ \ +\ + if( n0 == 1 ) \ + { \ + if( blis_side == BLIS_LEFT ) \ + { \ + if(bli_is_notrans(blis_transa)) \ + { \ + PASTEMAC(ch, trsv_unf_var2) \ + ( \ + blis_uploa, \ + blis_transa, \ + blis_diaga, \ + m0, \ + (ftype*)alpha, \ + (ftype*)a, rs_a, cs_a, \ + (ftype*)b, rs_b, \ + NULL \ + ); \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ + return; \ + } \ + else if(bli_is_trans(blis_transa)) \ + { \ + PASTEMAC(ch, trsv_unf_var1) \ + ( \ + blis_uploa, \ + blis_transa, \ + blis_diaga, \ + m0, \ + (ftype*)alpha, \ + (ftype*)a, rs_a, cs_a, \ + (ftype*)b, rs_b, \ + NULL \ + ); \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ + return; \ + } \ + } \ + else if( ( blis_side == BLIS_RIGHT ) && ( m0 != 1 ) ) \ + { \ + /* b = alpha * b; */ \ + PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ + ( \ + conja, \ + m0, \ + (ftype*)alpha, \ + b, rs_b, \ + NULL, \ + NULL \ + ); \ + if(blis_diaga == BLIS_NONUNIT_DIAG) \ + { \ + conja = bli_extract_conj( blis_transa ); \ + PASTEMAC(ch,copycjs)( conja, *a, a_conj ); \ + for(int indx = 0; indx < m0; indx ++) \ + { \ + PASTEMAC(ch,invscals)( a_conj, b[indx] ); \ + } \ + }\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ + return; \ + } \ + } \ + else if( m0 == 1 ) \ + { \ + if(blis_side == BLIS_RIGHT) \ + { \ + if(bli_is_notrans(blis_transa)) \ + { \ + if(blis_uploa == BLIS_UPPER) \ + blis_uploa = BLIS_LOWER; \ + else \ + blis_uploa = BLIS_UPPER; \ + PASTEMAC(ch, trsv_unf_var1)( \ + blis_uploa, \ + blis_transa, \ + blis_diaga, \ + n0, \ + (ftype*)alpha, \ + (ftype*)a, cs_a, rs_a, \ + (ftype*)b, cs_b, \ + 
NULL); \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ + return; \ + } \ + else if(bli_is_trans(blis_transa)) \ + { \ + if(blis_uploa == BLIS_UPPER) \ + blis_uploa = BLIS_LOWER; \ + else \ + blis_uploa = BLIS_UPPER; \ + PASTEMAC(ch, trsv_unf_var2)( \ + blis_uploa, \ + blis_transa, \ + blis_diaga, \ + n0, \ + (ftype*)alpha, \ + (ftype*)a, cs_a, rs_a, \ + (ftype*)b, cs_b, \ + NULL); \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ + return; \ + } \ + } \ + else if(( blis_side == BLIS_LEFT ) && ( n0 != 1 )) \ + { \ + /* b = alpha * b; */ \ + PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ + ( \ + conja, \ + n0, \ + (ftype*)alpha, \ + b, cs_b, \ + NULL, \ + NULL \ + ); \ + if(blis_diaga == BLIS_NONUNIT_DIAG) \ + { \ + conja = bli_extract_conj( blis_transa ); \ + PASTEMAC(ch,copycjs)( conja, *a, a_conj ); \ + for(int indx = 0; indx < n0; indx ++) \ + { \ + PASTEMAC(ch,invscals)( a_conj, b[indx*cs_b] ); \ + }\ + } \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ + return; \ + } \ + } \ +\ + const struc_t struca = BLIS_TRIANGULAR; \ +\ + obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ + obj_t ao = BLIS_OBJECT_INITIALIZER; \ + obj_t bo = BLIS_OBJECT_INITIALIZER; \ +\ + dim_t mn0_a; \ +\ + bli_set_dim_with_side( blis_side, m0, n0, &mn0_a ); \ +\ + bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \ +\ + bli_obj_init_finish( dt, mn0_a, mn0_a, (ftype*)a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m0, n0, (ftype*)b, rs_b, cs_b, &bo ); \ +\ + bli_obj_set_uplo( blis_uploa, &ao ); \ + bli_obj_set_diag( blis_diaga, &ao ); \ + bli_obj_set_conjtrans( blis_transa, &ao ); \ +\ + bli_obj_set_struc( struca, &ao ); \ +\ + PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \ + ( \ + blis_side, \ + &alphao, \ + &ao, \ + &bo, \ + NULL, \ + NULL \ + ); \ +\ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ +} + +#endif + +#ifdef BLIS_ENABLE_BLAS + +void strsm_ +( + const f77_char* side, + const f77_char* uploa, + const f77_char* transa, + const f77_char* diaga, + const f77_int* m, + const f77_int* n, + const float* alpha, + const float* a, const f77_int* lda, + float* b, const f77_int* ldb +) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) + AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'd', + *side, *uploa,*transa, *diaga, *m, *n, + (void*)alpha,*lda, *ldb); + + side_t blis_side; + uplo_t blis_uploa; + trans_t blis_transa; + diag_t blis_diaga; + dim_t m0, n0; + conj_t conja = BLIS_NO_CONJUGATE ; + + /* Initialize BLIS. */ + bli_init_auto(); + + /* Perform BLAS parameter checking. */ + PASTEBLACHK(trsm) + ( + MKSTR(s), + MKSTR(trsm), + side, + uploa, + transa, + diaga, + m, + n, + lda, + ldb + ); + + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ + bli_param_map_netlib_to_blis_side( *side, &blis_side ); + bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); + bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); + + /* Typecast BLAS integers to BLIS integers. */ + bli_convert_blas_dim1( *m, m0 ); + bli_convert_blas_dim1( *n, n0 ); + + /* Set the row and column strides of the matrix operands. 
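/*
   Illustration only (not part of the patch): the table and branches above
   reduce degenerate TRSM calls to cheaper operations.  With side = left and
   n0 == 1, B is m x 1, so A*X = alpha*B is a single triangular solve (TRSV).
   With m0 == 1 and side = right, X*A = alpha*B is transposed into
   A^T * X^T = alpha * B^T; for a column-major A, reading it with row and
   column strides swapped yields A^T, and transposition also flips the
   upper/lower flag, which is why the code passes cs_a/rs_a in swapped order
   and toggles blis_uploa.  A minimal reference of the n0 == 1,
   lower-triangular, non-unit, no-transpose solve (forward substitution);
   lower_trsv_ref is a hypothetical name, not a BLIS routine.
*/
static void lower_trsv_ref( int m, double alpha,
                            const double* a, int lda, /* column-major m x m */
                            double* b )               /* rhs, overwritten with x */
{
    for ( int i = 0; i < m; i++ )
    {
        double s = alpha * b[ i ];
        for ( int j = 0; j < i; j++ )
            s -= a[ i + j * lda ] * b[ j ]; /* b[j] already holds x[j] */
        b[ i ] = s / a[ i + i * lda ];      /* non-unit diagonal */
    }
}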
*/ + const inc_t rs_a = 1; + const inc_t cs_a = *lda; + const inc_t rs_b = 1; + const inc_t cs_b = *ldb; + const num_t dt = BLIS_FLOAT; + + if( n0 == 1 ) + { + if( blis_side == BLIS_LEFT ) + { + if(bli_is_notrans(blis_transa)) + { + bli_strsv_unf_var2 + ( + blis_uploa, + blis_transa, + blis_diaga, + m0, + (float*)alpha, + (float*)a, rs_a, cs_a, + (float*)b, rs_b, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + else if(bli_is_trans(blis_transa)) + { + bli_strsv_unf_var1 + ( + blis_uploa, + blis_transa, + blis_diaga, + m0, + (float*)alpha, + (float*)a, rs_a, cs_a, + (float*)b, rs_b, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + } + else if( ( blis_side == BLIS_RIGHT ) && ( m0 != 1 ) ) + { + /* b = alpha * b; */ + bli_sscalv_ex + ( + conja, + m0, + (float*)alpha, + b, rs_b, + NULL, + NULL + ); + if(blis_diaga == BLIS_NONUNIT_DIAG) + { + float inva = 1.0/ *a; + for(dim_t indx = 0; indx < m0; indx ++) + { + b[indx] = ( inva * b[indx] ); + } + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + } + else if( m0 == 1 ) + { + if(blis_side == BLIS_RIGHT) + { + if(bli_is_notrans(blis_transa)) + { + if(blis_uploa == BLIS_UPPER) + blis_uploa = BLIS_LOWER; + else + blis_uploa = BLIS_UPPER; + + bli_strsv_unf_var1 + ( + blis_uploa, + blis_transa, + blis_diaga, + n0, + (float*)alpha, + (float*)a, cs_a, rs_a, + (float*)b, cs_b, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + else if(bli_is_trans(blis_transa)) + { + if(blis_uploa == BLIS_UPPER) + blis_uploa = BLIS_LOWER; + else + blis_uploa = BLIS_UPPER; + + bli_strsv_unf_var2 + ( + blis_uploa, + blis_transa, + blis_diaga, + n0, + (float*)alpha, + (float*)a, cs_a, rs_a, + (float*)b, cs_b, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + } + else if(( blis_side == BLIS_LEFT ) && ( n0 != 1 )) + { + /* b = alpha * b; */ + bli_sscalv_ex + ( + conja, + n0, + (float*)alpha, + b, cs_b, + NULL, + NULL + ); + if(blis_diaga == BLIS_NONUNIT_DIAG) + { + float inva = 1.0/ *a; + for(dim_t indx = 0; indx < n0; indx ++) + { + b[indx*cs_b] = (inva * b[indx*cs_b] ); + } + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + } + const struc_t struca = BLIS_TRIANGULAR; + + obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t ao = BLIS_OBJECT_INITIALIZER; + obj_t bo = BLIS_OBJECT_INITIALIZER; + + dim_t mn0_a; + + bli_set_dim_with_side( blis_side, m0, n0, &mn0_a ); + + bli_obj_init_finish_1x1( dt, (float*)alpha, &alphao ); + + bli_obj_init_finish( dt, mn0_a, mn0_a, (float*)a, rs_a, cs_a, &ao ); + bli_obj_init_finish( dt, m0, n0, (float*)b, rs_b, cs_b, &bo ); + + bli_obj_set_uplo( blis_uploa, &ao ); + bli_obj_set_diag( blis_diaga, &ao ); + bli_obj_set_conjtrans( blis_transa, &ao ); + + bli_obj_set_struc( struca, &ao ); + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) { +#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM + /* bli_strsm_small is performing better existing native + * implementations for [m,n]<=1000 for single thread. 
+ * In case of multithread when [m,n]<=128 sinlge thread implemenation + * is doing better than native multithread */ + bool nt = bli_thread_get_is_parallel(); + if((nt==0 && m0<=1000 && n0<=1000) || + (nt && (m0+n0)<320) ) + { + err_t status; + status = bli_trsm_small + ( + blis_side, + &alphao, + &ao, + &bo, + NULL, + NULL + ); + if (status == BLIS_SUCCESS) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + /* Finalize BLIS. */ + bli_finalize_auto(); + return; + } + } +#endif + } + bli_trsmnat + ( + blis_side, + &alphao, + &ao, + &bo, + NULL, + NULL + ); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) + /* Finalize BLIS. */ + bli_finalize_auto(); +} + +void dtrsm_ +( + const f77_char* side, + const f77_char* uploa, + const f77_char* transa, + const f77_char* diaga, + const f77_int* m, + const f77_int* n, + const double* alpha, + const double* a, const f77_int* lda, + double* b, const f77_int* ldb +) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) + AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'd', + *side, *uploa,*transa, *diaga, *m, *n, + (void*)alpha,*lda, *ldb); + + side_t blis_side; + uplo_t blis_uploa; + trans_t blis_transa; + diag_t blis_diaga; + dim_t m0, n0; + conj_t conja = BLIS_NO_CONJUGATE ; + + /* Initialize BLIS. */ + bli_init_auto(); + + /* Perform BLAS parameter checking. */ + PASTEBLACHK(trsm) + ( + MKSTR(d), + MKSTR(trsm), + side, + uploa, + transa, + diaga, + m, + n, + lda, + ldb + ); + + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ + bli_param_map_netlib_to_blis_side( *side, &blis_side ); + bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); + bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); + + /* Typecast BLAS integers to BLIS integers. */ + bli_convert_blas_dim1( *m, m0 ); + bli_convert_blas_dim1( *n, n0 ); + + /* Set the row and column strides of the matrix operands. 
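/*
   Illustration only (not part of the patch): the small-matrix dispatch above
   routes small problems to bli_trsm_small and everything else to the native
   path.  Restated as a standalone predicate; use_trsm_small is a
   hypothetical name, and the thresholds are the ones appearing in the code
   above (m,n <= 1000 for a single thread, m+n < 320 when multithreaded).
*/
#include <stdbool.h>

static bool use_trsm_small( bool multithreaded, long m, long n )
{
    if ( !multithreaded )
        return ( m <= 1000 ) && ( n <= 1000 );
    return ( m + n ) < 320;
}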
*/ + const inc_t rs_a = 1; + const inc_t cs_a = *lda; + const inc_t rs_b = 1; + const inc_t cs_b = *ldb; + const num_t dt = BLIS_DOUBLE; + + if( n0 == 1 ) + { + if( blis_side == BLIS_LEFT ) + { + if(bli_is_notrans(blis_transa)) + { + bli_dtrsv_unf_var2 + ( + blis_uploa, + blis_transa, + blis_diaga, + m0, + (double*)alpha, + (double*)a, rs_a, cs_a, + (double*)b, rs_b, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + else if(bli_is_trans(blis_transa)) + { + bli_dtrsv_unf_var1 + ( + blis_uploa, + blis_transa, + blis_diaga, + m0, + (double*)alpha, + (double*)a, rs_a, cs_a, + (double*)b, rs_b, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + } + else if( ( blis_side == BLIS_RIGHT ) && ( m0 != 1 ) ) + { + /* b = alpha * b; */ + bli_dscalv_ex + ( + conja, + m0, + (double*)alpha, + b, rs_b, + NULL, + NULL + ); + if(blis_diaga == BLIS_NONUNIT_DIAG) + { + double inva = 1.0/ *a; + for(dim_t indx = 0; indx < m0; indx ++) + { + b[indx] = ( inva * b[indx] ); + } + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + } + else if( m0 == 1 ) + { + if(blis_side == BLIS_RIGHT) + { + if(bli_is_notrans(blis_transa)) + { + if(blis_uploa == BLIS_UPPER) + blis_uploa = BLIS_LOWER; + else + blis_uploa = BLIS_UPPER; + + bli_dtrsv_unf_var1 + ( + blis_uploa, + blis_transa, + blis_diaga, + n0, + (double*)alpha, + (double*)a, cs_a, rs_a, + (double*)b, cs_b, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + else if(bli_is_trans(blis_transa)) + { + if(blis_uploa == BLIS_UPPER) + blis_uploa = BLIS_LOWER; + else + blis_uploa = BLIS_UPPER; + + bli_dtrsv_unf_var2 + ( + blis_uploa, + blis_transa, + blis_diaga, + n0, + (double*)alpha, + (double*)a, cs_a, rs_a, + (double*)b, cs_b, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + } + else if(( blis_side == BLIS_LEFT ) && ( n0 != 1 )) + { + /* b = alpha * b; */ + bli_dscalv_ex + ( + conja, + n0, + (double*)alpha, + b, cs_b, + NULL, + NULL + ); + if(blis_diaga == BLIS_NONUNIT_DIAG) + { + double inva = 1.0/ *a; + for(dim_t indx = 0; indx < n0; indx ++) + { + b[indx*cs_b] = (inva * b[indx*cs_b] ); + } + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + } + + const struc_t struca = BLIS_TRIANGULAR; + + obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t ao = BLIS_OBJECT_INITIALIZER; + obj_t bo = BLIS_OBJECT_INITIALIZER; + + dim_t mn0_a; + + bli_set_dim_with_side( blis_side, m0, n0, &mn0_a ); + + bli_obj_init_finish_1x1( dt, (double*)alpha, &alphao ); + + bli_obj_init_finish( dt, mn0_a, mn0_a, (double*)a, rs_a, cs_a, &ao ); + bli_obj_init_finish( dt, m0, n0, (double*)b, rs_b, cs_b, &bo ); + + bli_obj_set_uplo( blis_uploa, &ao ); + bli_obj_set_diag( blis_diaga, &ao ); + bli_obj_set_conjtrans( blis_transa, &ao ); + + bli_obj_set_struc( struca, &ao ); + + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == TRUE) { + +#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM + /* bli_dtrsm_small is performing better existing native + * implementations for [m,n]<=1000 for single thread. 
+ * In case of multithread when [m,n]<=128 sinlge thread implemenation + * is doing better than native multithread */ + bool nt = bli_thread_get_is_parallel(); + if((nt==0 && m0<=1000 && n0<=1000) || + (nt && (m0+n0)<320) ) + { + err_t status; + status = bli_trsm_small + ( + blis_side, + &alphao, + &ao, + &bo, + NULL, + NULL + ); + if (status == BLIS_SUCCESS) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + /* Finalize BLIS. */ + bli_finalize_auto(); + return; + } + } +#endif + } + bli_trsmnat + ( + blis_side, + &alphao, + &ao, + &bo, + NULL, + NULL + ); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) + /* Finalize BLIS. */ + bli_finalize_auto(); +} +#if 0 +void ztrsm_ +( + const f77_char* side, + const f77_char* uploa, + const f77_char* transa, + const f77_char* diaga, + const f77_int* m, + const f77_int* n, + const dcomplex* alpha, + const dcomplex* a, const f77_int* lda, + dcomplex* b, const f77_int* ldb +) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) + AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'z', + *side, *uploa,*transa, *diaga, *m, *n, + (void*)alpha,*lda, *ldb); + + side_t blis_side; + uplo_t blis_uploa; + trans_t blis_transa; + diag_t blis_diaga; + dim_t m0, n0; + conj_t conja = BLIS_NO_CONJUGATE; + + /* Initialize BLIS. */ + bli_init_auto(); + + /* Perform BLAS parameter checking. */ + PASTEBLACHK(trsm) + ( + MKSTR(z), + MKSTR(trsm), + side, + uploa, + transa, + diaga, + m, + n, + lda, + ldb + ); + + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ + bli_param_map_netlib_to_blis_side( *side, &blis_side ); + bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); + bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); + + /* Typecast BLAS integers to BLIS integers. */ + bli_convert_blas_dim1( *m, m0 ); + bli_convert_blas_dim1( *n, n0 ); + + /* Set the row and column strides of the matrix operands. */ + const inc_t rs_a = 1; + const inc_t cs_a = *lda; + const inc_t rs_b = 1; + const inc_t cs_b = *ldb; + const num_t dt = BLIS_DCOMPLEX; + + + if( n0 == 1 ) + { + if( blis_side == BLIS_LEFT ) + { + if(bli_is_notrans(blis_transa)) + { + bli_ztrsv_unf_var2 + ( + blis_uploa, + blis_transa, + blis_diaga, + m0, + (dcomplex*)alpha, + (dcomplex*)a, rs_a, cs_a, + (dcomplex*)b, rs_b, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + else if(bli_is_trans(blis_transa)) + { + bli_ztrsv_unf_var1 + ( + blis_uploa, + blis_transa, + blis_diaga, + m0, + (dcomplex*)alpha, + (dcomplex*)a, rs_a, cs_a, + (dcomplex*)b, rs_b, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + } + else if( ( blis_side == BLIS_RIGHT ) && ( m0 != 1 ) ) + { + bli_zscalv_ex + ( + conja, + m0, + (dcomplex*)alpha, + (dcomplex*)b, rs_b, + NULL, + NULL + ); + if(blis_diaga == BLIS_NONUNIT_DIAG) + { + dcomplex inva = {1.0, 0.0}; + dcomplex a_dup; + /** + * For conjugate transpose and non-unit diagonal + * kernel, negating imaginary part of A. + * As the dimension of A is 1x1, there's going to + * be only one 1 element of A. 
+ */ + if(*transa == 'C' && *diaga == 'N') + { + a_dup.real = a->real; + a_dup.imag = a->imag * -1.0; + } + else + { + a_dup.real = a->real; + a_dup.imag = a->imag; + } + +#ifdef BLIS_ENABLE_TRSM_PREINVERSION + bli_zinvscals(a_dup, inva); +#else + inva.real = a_dup.real; + inva.imag = a_dup.imag; +#endif + for(dim_t indx = 0; indx < m0; indx ++) + { +#ifdef BLIS_ENABLE_TRSM_PREINVERSION + bli_zscals(inva, b[indx]) +#else + + bli_zinvscals(inva, b[indx]) +#endif + } + + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + } + else if( m0 == 1 ) + { + if(blis_side == BLIS_RIGHT) + { + if(bli_is_notrans(blis_transa)) + { + if(blis_uploa == BLIS_UPPER) + blis_uploa = BLIS_LOWER; + else + blis_uploa = BLIS_UPPER; + + bli_ztrsv_unf_var1 + ( + blis_uploa, + blis_transa, + blis_diaga, + n0, + (dcomplex*)alpha, + (dcomplex*)a, cs_a, rs_a, + (dcomplex*)b, cs_b, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + else if(bli_is_trans(blis_transa)) + { + if(blis_uploa == BLIS_UPPER) + blis_uploa = BLIS_LOWER; + else + blis_uploa = BLIS_UPPER; + + bli_ztrsv_unf_var2 + ( + blis_uploa, + blis_transa, + blis_diaga, + n0, + (dcomplex*)alpha, + (dcomplex*)a, cs_a, rs_a, + (dcomplex*)b, cs_b, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + } + else if(( blis_side == BLIS_LEFT ) && ( n0 != 1 )) + { + bli_zscalv_ex + ( + conja, + n0, + (dcomplex*)alpha, + (dcomplex*)b, cs_b, + NULL, + NULL + ); + if(blis_diaga == BLIS_NONUNIT_DIAG) + { + dcomplex inva = {1.0, 0.0}; + dcomplex a_dup; + /** + * For conjugate transpose and non-unit diagonal + * kernel, negating imaginary part of A. + * As the dimension of A is 1x1, there's going to + * be only one 1 element of A. + */ + if(*transa == 'C' && *diaga == 'N') + { + a_dup.real = a->real; + a_dup.imag = a->imag * -1.0; + } + else + { + a_dup.real = a->real; + a_dup.imag = a->imag; + } + +#ifdef BLIS_ENABLE_TRSM_PREINVERSION + bli_zinvscals(a_dup, inva); +#else + inva.real = a_dup.real; + inva.imag = a_dup.imag; +#endif + for(dim_t indx = 0; indx < n0; indx ++) + { +#ifdef BLIS_ENABLE_TRSM_PREINVERSION + bli_zscals(inva ,b[indx * cs_b]) +#else + + bli_zinvscals(inva ,b[indx * cs_b]) +#endif + } + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + + } + } + + const struc_t struca = BLIS_TRIANGULAR; + + obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t ao = BLIS_OBJECT_INITIALIZER; + obj_t bo = BLIS_OBJECT_INITIALIZER; + + dim_t mn0_a; + + bli_set_dim_with_side( blis_side, m0, n0, &mn0_a ); + + bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao ); + + bli_obj_init_finish( dt, mn0_a, mn0_a, (dcomplex*)a, rs_a, cs_a, &ao ); + bli_obj_init_finish( dt, m0, n0, (dcomplex*)b, rs_b, cs_b, &bo ); + + bli_obj_set_uplo( blis_uploa, &ao ); + bli_obj_set_diag( blis_diaga, &ao ); + bli_obj_set_conjtrans( blis_transa, &ao ); + + bli_obj_set_struc( struca, &ao ); + +#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM + /* bli_ztrsm_small is performing better existing native + * implementations for [m,n]<=1000 for single thread. + * In case of multithread when [m,n]<=128 sinlge thread implemenation + * is doing better than native multithread */ + bool nt = bli_thread_get_is_parallel(); + if((nt==0 && m0<=500 && n0<=500) || + (nt && (m0+n0)<128) ) + { + err_t status; + status = bli_trsm_small + ( + blis_side, + &alphao, + &ao, + &bo, + NULL, + NULL + ); + if (status == BLIS_SUCCESS) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + /* Finalize BLIS. 
*/ + bli_finalize_auto(); + return; + } + } +#endif + + bli_trsmnat + ( + blis_side, + &alphao, + &ao, + &bo, + NULL, + NULL + ); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) + /* Finalize BLIS. */ + bli_finalize_auto(); +} +#endif +#if 0 +void ctrsm_ +( + const f77_char* side, + const f77_char* uploa, + const f77_char* transa, + const f77_char* diaga, + const f77_int* m, + const f77_int* n, + const scomplex* alpha, + const scomplex* a, const f77_int* lda, + scomplex* b, const f77_int* ldb +) +{ + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) + AOCL_DTL_LOG_TRSM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 's', + *side, *uploa,*transa, *diaga, *m, *n, + (void*)alpha,*lda, *ldb); + + side_t blis_side; + uplo_t blis_uploa; + trans_t blis_transa; + diag_t blis_diaga; + dim_t m0, n0; + conj_t conja = BLIS_NO_CONJUGATE; + + /* Initialize BLIS. */ + bli_init_auto(); + + /* Perform BLAS parameter checking. */ + PASTEBLACHK(trsm) + ( + MKSTR(c), + MKSTR(trsm), + side, + uploa, + transa, + diaga, + m, + n, + lda, + ldb + ); + + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ + bli_param_map_netlib_to_blis_side( *side, &blis_side ); + bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); + bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); + + /* Typecast BLAS integers to BLIS integers. */ + bli_convert_blas_dim1( *m, m0 ); + bli_convert_blas_dim1( *n, n0 ); + + /* Set the row and column strides of the matrix operands. */ + const inc_t rs_a = 1; + const inc_t cs_a = *lda; + const inc_t rs_b = 1; + const inc_t cs_b = *ldb; + const num_t dt = BLIS_SCOMPLEX; + + + if( n0 == 1 ) + { + if( blis_side == BLIS_LEFT ) + { + if(bli_is_notrans(blis_transa)) + { + bli_ctrsv_unf_var2 + ( + blis_uploa, + blis_transa, + blis_diaga, + m0, + (scomplex*)alpha, + (scomplex*)a, rs_a, cs_a, + (scomplex*)b, rs_b, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + else if(bli_is_trans(blis_transa)) + { + bli_ctrsv_unf_var1 + ( + blis_uploa, + blis_transa, + blis_diaga, + m0, + (scomplex*)alpha, + (scomplex*)a, rs_a, cs_a, + (scomplex*)b, rs_b, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + } + else if( ( blis_side == BLIS_RIGHT ) && ( m0 != 1 ) ) + { + bli_cscalv_ex + ( + conja, + m0, + (scomplex*)alpha, + (scomplex*)b, rs_b, + NULL, + NULL + ); + if(blis_diaga == BLIS_NONUNIT_DIAG) + { + scomplex inva = {1.0, 0.0}; + scomplex a_dup; + /** + * For conjugate transpose and non-unit diagonal + * kernel, negating imaginary part of A. + * As the dimension of A is 1x1, there's going to + * be only one 1 element of A. 
+ */ + if(*transa == 'C' && *diaga == 'N') + { + a_dup.real = a->real; + a_dup.imag = a->imag * -1.0; + } + else + { + a_dup.real = a->real; + a_dup.imag = a->imag; + } + +#ifdef BLIS_ENABLE_TRSM_PREINVERSION + bli_cinvscals(a_dup, inva); +#else + inva.real = a_dup.real; + inva.imag = a_dup.imag; +#endif + + for(dim_t indx = 0; indx < m0; indx ++) + { +#ifdef BLIS_ENABLE_TRSM_PREINVERSION + bli_cscals(inva ,b[indx]) +#else + bli_cinvscals(inva, b[indx]) +#endif + } + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + + } + } + else if( m0 == 1 ) + { + if(blis_side == BLIS_RIGHT) + { + if(bli_is_notrans(blis_transa)) + { + if(blis_uploa == BLIS_UPPER) + blis_uploa = BLIS_LOWER; + else + blis_uploa = BLIS_UPPER; + + bli_ctrsv_unf_var1 + ( + blis_uploa, + blis_transa, + blis_diaga, + n0, + (scomplex*)alpha, + (scomplex*)a, cs_a, rs_a, + (scomplex*)b, cs_b, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + else if(bli_is_trans(blis_transa)) + { + if(blis_uploa == BLIS_UPPER) + blis_uploa = BLIS_LOWER; + else + blis_uploa = BLIS_UPPER; + + bli_ctrsv_unf_var2 + ( + blis_uploa, + blis_transa, + blis_diaga, + n0, + (scomplex*)alpha, + (scomplex*)a, cs_a, rs_a, + (scomplex*)b, cs_b, + NULL + ); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + } + else if(( blis_side == BLIS_LEFT ) && ( n0 != 1 )) + { + bli_cscalv_ex + ( + conja, + n0, + (scomplex*)alpha, + (scomplex*)b, cs_b, + NULL, + NULL + ); + if(blis_diaga == BLIS_NONUNIT_DIAG) + { + scomplex inva = {1.0, 0.0}; + scomplex a_dup; + /** + * For conjugate transpose and non-unit diagonal + * kernel, negating imaginary part of A. + * As the dimension of A is 1x1, there's going to + * be only one 1 element of A. + */ + if(*transa == 'C' && *diaga == 'N') + { + a_dup.real = a->real; + a_dup.imag = a->imag * -1.0; + } + else + { + a_dup.real = a->real; + a_dup.imag = a->imag; + } + +#ifdef BLIS_ENABLE_TRSM_PREINVERSION + bli_cinvscals(a_dup, inva) +#else + inva.real = a_dup.real; + inva.imag = a_dup.imag; +#endif + for(dim_t indx = 0; indx < n0; indx ++) + { +#ifdef BLIS_ENABLE_TRSM_PREINVERSION + bli_cscals(inva ,b[indx * cs_b]) +#else + bli_cinvscals(inva, b[indx * cs_b]) +#endif + + } + } + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + return; + } + } + + const struc_t struca = BLIS_TRIANGULAR; + + obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; + obj_t ao = BLIS_OBJECT_INITIALIZER; + obj_t bo = BLIS_OBJECT_INITIALIZER; + + dim_t mn0_a; + + bli_set_dim_with_side( blis_side, m0, n0, &mn0_a ); + + bli_obj_init_finish_1x1( dt, (scomplex*)alpha, &alphao ); + + bli_obj_init_finish( dt, mn0_a, mn0_a, (scomplex*)a, rs_a, cs_a, &ao ); + bli_obj_init_finish( dt, m0, n0, (scomplex*)b, rs_b, cs_b, &bo ); + + bli_obj_set_uplo( blis_uploa, &ao ); + bli_obj_set_diag( blis_diaga, &ao ); + bli_obj_set_conjtrans( blis_transa, &ao ); + + bli_obj_set_struc( struca, &ao ); +#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM + /* bli_ztrsm_small is performing better existing native + * implementations for [m,n]<=1000 for single thread. + * In case of multithread when [m,n]<=128 sinlge thread implemenation + * is doing better than native multithread */ + bool nt = bli_thread_get_is_parallel(); + if((nt==0 && m0<=1000 && n0<=1000) || + (nt && (m0+n0)<320) ) + { + err_t status; + status = bli_trsm_small + ( + blis_side, + &alphao, + &ao, + &bo, + NULL, + NULL + ); + if (status == BLIS_SUCCESS) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); + /* Finalize BLIS. 
*/ + bli_finalize_auto(); + return; + } + } +#endif + bli_trsmnat + ( + blis_side, + &alphao, + &ao, + &bo, + NULL, + NULL + ); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) + /* Finalize BLIS. */ + bli_finalize_auto(); +} +#endif +INSERT_GENTFUNC_BLAS_CZ( trsm, trsm ) + +#endif diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c index de9d8339d..7146e8687 100644 --- a/kernels/zen/1/bli_scalv_zen_int10.c +++ b/kernels/zen/1/bli_scalv_zen_int10.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2017 - 2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2017 - 2022, Advanced Micro Devices, Inc. All rights reserved. Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without @@ -64,16 +64,7 @@ void bli_sscalv_zen_int10 if ( PASTEMAC(s,eq0)( *alpha ) ) { float* zero = bli_s0; -#ifdef BLIS_CONFIG_EPYC - bli_ssetv_zen_int - ( - BLIS_NO_CONJUGATE, - n, - zero, - x, incx, - cntx - ); -#else + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); f ( @@ -83,7 +74,7 @@ void bli_sscalv_zen_int10 x, incx, cntx ); -#endif + return; } @@ -342,16 +333,7 @@ void bli_dscalv_zen_int10 if ( PASTEMAC(d,eq0)( *alpha ) ) { double* zero = bli_d0; -#ifdef BLIS_CONFIG_EPYC - bli_dsetv_zen_int - ( - BLIS_NO_CONJUGATE, - n, - zero, - x, incx, - cntx - ); -#else + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); f @@ -362,7 +344,7 @@ void bli_dscalv_zen_int10 x, incx, cntx ); -#endif + return; } diff --git a/kernels/zen/1f/bli_axpyf_zen_int_4.c b/kernels/zen/1f/bli_axpyf_zen_int_4.c index f5a043db8..bb24e6c52 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_4.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_4.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -95,29 +95,6 @@ void bli_caxpyf_zen_int_4 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { -#ifdef BLIS_CONFIG_EPYC - for ( i = 0; i < b_n; ++i ) - { - scomplex* a1 = a + (0 )*inca + (i )*lda; - scomplex* chi1 = x + (i )*incx; - scomplex* y1 = y + (0 )*incy; - scomplex alpha_chi1; - - bli_ccopycjs( conjx, *chi1, alpha_chi1 ); - bli_cscals( *alpha, alpha_chi1 ); - - bli_caxpyv_zen_int5 - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } - -#else caxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) @@ -141,7 +118,6 @@ void bli_caxpyf_zen_int_4 ); } -#endif return; } @@ -357,28 +333,6 @@ void bli_zaxpyf_zen_int_4 // operation as a loop over axpyv. 
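A note on the small-matrix dispatch that recurs in the trsm wrappers above: each wrapper first tries bli_trsm_small() when the problem is small enough and falls through to the native bli_trsmnat() path whenever the small-path kernel declines the problem. The size cutoffs are per-datatype tuning choices (the hunks above use m0,n0 <= 1000 / m0+n0 < 320 in one place and 500 / 128 in another), not fixed constants. A condensed sketch of the pattern, using only names from the code above (tracing and finalization calls omitted for brevity):

    #ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
        /* Single thread: the small path wins up to roughly 1000x1000.
           Multithreaded: only for much smaller problems. */
        bool nt = bli_thread_get_is_parallel();
        if ( ( nt == 0 && m0 <= 1000 && n0 <= 1000 ) ||
             ( nt      && ( m0 + n0 ) < 320 ) )
        {
            err_t status = bli_trsm_small( blis_side, &alphao, &ao, &bo, NULL, NULL );
            if ( status == BLIS_SUCCESS ) return;  /* small path handled it */
        }
    #endif
        /* Otherwise fall back to the native object-based implementation. */
        bli_trsmnat( blis_side, &alphao, &ao, &bo, NULL, NULL );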
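The bli_scalv_zen_int10.c hunks above show the replacement pattern used throughout this patch: rather than hard-wiring an EPYC-only kernel behind BLIS_CONFIG_EPYC, the kernel is queried from the runtime context, so a single source path serves every configuration. The double-precision setv case, condensed (all names appear in the hunk above):

    if ( cntx == NULL ) cntx = bli_gks_query_cntx();

    /* Look up the level-1v SETV kernel registered for the current
       architecture and invoke it through the function pointer. */
    dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );

    f( BLIS_NO_CONJUGATE, n, zero, x, incx, cntx );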
if ( b_n != fuse_fac ) { -#ifdef BLIS_CONFIG_EPYC - for ( i = 0; i < b_n; ++i ) - { - dcomplex* a1 = a + (0 )*inca + (i )*lda; - dcomplex* chi1 = x + (i )*incx; - dcomplex* y1 = y + (0 )*incy; - dcomplex alpha_chi1; - - bli_zcopycjs( conjx, *chi1, alpha_chi1 ); - bli_zscals( *alpha, alpha_chi1 ); - - bli_zaxpyv_zen_int5 - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } -#else zaxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DCOMPLEX, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) @@ -402,7 +356,6 @@ void bli_zaxpyf_zen_int_4 ); } -#endif return; } diff --git a/kernels/zen/1f/bli_axpyf_zen_int_5.c b/kernels/zen/1f/bli_axpyf_zen_int_5.c index 112519777..d09a85f57 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_5.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_5.c @@ -108,29 +108,6 @@ void bli_saxpyf_zen_int_5 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { -#ifdef BLIS_CONFIG_EPYC - for ( i = 0; i < b_n; ++i ) - { - float* a1 = a + (0 )*inca + (i )*lda; - float* chi1 = x + (i )*incx; - float* y1 = y + (0 )*incy; - float alpha_chi1; - - bli_scopycjs( conjx, *chi1, alpha_chi1 ); - bli_sscals( *alpha, alpha_chi1 ); - - bli_saxpyv_zen_int10 - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } - -#else saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) @@ -154,7 +131,6 @@ void bli_saxpyf_zen_int_5 ); } -#endif return; } @@ -382,29 +358,6 @@ void bli_daxpyf_zen_int_5 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { -#ifdef BLIS_CONFIG_EPYC - for ( i = 0; i < b_n; ++i ) - { - double* a1 = a + (0 )*inca + (i )*lda; - double* chi1 = x + (i )*incx; - double* y1 = y + (0 )*incy; - double alpha_chi1; - - bli_dcopycjs( conjx, *chi1, alpha_chi1 ); - bli_dscals( *alpha, alpha_chi1 ); - - bli_daxpyv_zen_int10 - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } - -#else daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) @@ -428,7 +381,6 @@ void bli_daxpyf_zen_int_5 ); } -#endif return; } @@ -655,29 +607,6 @@ static void bli_daxpyf_zen_int_16x2 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { -#ifdef BLIS_CONFIG_EPYC - for ( i = 0; i < b_n; ++i ) - { - double* a1 = a + (0 )*inca + (i )*lda; - double* chi1 = x + (i )*incx; - double* y1 = y + (0 )*incy; - double alpha_chi1; - - bli_dcopycjs( conjx, *chi1, alpha_chi1 ); - bli_dscals( *alpha, alpha_chi1 ); - - bli_daxpyv_zen_int10 - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } - -#else daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) @@ -701,7 +630,6 @@ static void bli_daxpyf_zen_int_16x2 ); } -#endif return; } @@ -966,43 +894,21 @@ void bli_daxpyf_zen_int_16x4 // operation as a loop over axpyv. 
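The axpyf kernels in the hunks above keep the same fallback for a partial fuse factor, again routed through the context instead of a BLIS_CONFIG_EPYC-guarded call: when b_n != fuse_fac, the fused operation degenerates to one axpyv per column. Roughly, for the double-precision variant (a sketch assembled from the removed and retained lines above):

    daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );

    for ( dim_t i = 0; i < b_n; ++i )
    {
        double* a1   = a + (0  )*inca + (i  )*lda;
        double* chi1 = x + (i  )*incx;
        double* y1   = y + (0  )*incy;
        double  alpha_chi1;

        /* alpha_chi1 = conjx(chi1) * alpha, then y1 += alpha_chi1 * a1. */
        bli_dcopycjs( conjx, *chi1, alpha_chi1 );
        bli_dscals( *alpha, alpha_chi1 );

        f( conja, m, &alpha_chi1, a1, inca, y1, incy, cntx );
    }
    return;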
if ( b_n != fuse_fac ) { -#ifdef BLIS_CONFIG_EPYC - if(b_n & 2) - { - bli_daxpyf_zen_int_16x2( conja, - conjx, - m, 2, - alpha, a, inca, lda, - x, incx, - y, incy, - cntx - ); - b_n -= 2; - a += 2*lda; - x += 2 * incx; - } - for ( i = 0; i < b_n; ++i ) - { - double* a1 = a + (0 )*inca + (i )*lda; - double* chi1 = x + (i )*incx; - double* y1 = y + (0 )*incy; - double alpha_chi1; + if (b_n & 2) + { + bli_daxpyf_zen_int_16x2( conja, + conjx, + m, 2, + alpha, a, inca, lda, + x, incx, + y, incy, + cntx + ); + b_n -= 2; + a += 2*lda; + x += 2 * incx; + } - bli_dcopycjs( conjx, *chi1, alpha_chi1 ); - bli_dscals( *alpha, alpha_chi1 ); - - bli_daxpyv_zen_int10 - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } - -#else daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) @@ -1026,7 +932,6 @@ void bli_daxpyf_zen_int_16x4 ); } -#endif return; } @@ -1396,29 +1301,6 @@ void bli_caxpyf_zen_int_5 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { -#ifdef BLIS_CONFIG_EPYC - for ( i = 0; i < b_n; ++i ) - { - scomplex* a1 = a + (0 )*inca + (i )*lda; - scomplex* chi1 = x + (i )*incx; - scomplex* y1 = y + (0 )*incy; - scomplex alpha_chi1; - - bli_ccopycjs( conjx, *chi1, alpha_chi1 ); - bli_cscals( *alpha, alpha_chi1 ); - - bli_caxpyv_zen_int5 - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } - -#else caxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) @@ -1442,7 +1324,6 @@ void bli_caxpyf_zen_int_5 ); } -#endif return; } @@ -1810,29 +1691,6 @@ void bli_zaxpyf_zen_int_5 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { -#ifdef BLIS_CONFIG_EPYC - for ( i = 0; i < b_n; ++i ) - { - dcomplex* a1 = a + (0 )*inca + (i )*lda; - dcomplex* chi1 = x + (i )*incx; - dcomplex* y1 = y + (0 )*incy; - dcomplex alpha_chi1; - - bli_zcopycjs( conjx, *chi1, alpha_chi1 ); - bli_zscals( *alpha, alpha_chi1 ); - - bli_zaxpyv_zen_int5 - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } - -#else zaxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DCOMPLEX, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) @@ -1855,8 +1713,7 @@ void bli_zaxpyf_zen_int_5 cntx ); } - -#endif + return; } diff --git a/kernels/zen/1f/bli_axpyf_zen_int_6.c b/kernels/zen/1f/bli_axpyf_zen_int_6.c index 99b544db1..cf7dbd173 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_6.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_6.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -97,28 +97,6 @@ void bli_saxpyf_zen_int_6 // operation as a loop over axpyv. 
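One behavioural note on bli_daxpyf_zen_int_16x4 above: the two-column peel that used to be compiled only under BLIS_CONFIG_EPYC is now kept unconditionally, so any platform reaching this kernel takes the same path. In outline:

    /* If b_n falls short of the fuse factor and has its 2-bit set,
       handle two columns with the 16x2 kernel, then finish the rest
       as individual axpyv calls (as in the loop shown earlier). */
    if ( b_n & 2 )
    {
        bli_daxpyf_zen_int_16x2( conja, conjx, m, 2,
                                 alpha, a, inca, lda,
                                 x, incx, y, incy, cntx );
        b_n -= 2;
        a   += 2 * lda;
        x   += 2 * incx;
    }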
if ( b_n != fuse_fac ) { -#ifdef BLIS_CONFIG_EPYC - for ( i = 0; i < b_n; ++i ) - { - float* a1 = a + (0 )*inca + (i )*lda; - float* chi1 = x + (i )*incx; - float* y1 = y + (0 )*incy; - float alpha_chi1; - - bli_scopycjs( conjx, *chi1, alpha_chi1 ); - bli_sscals( *alpha, alpha_chi1 ); - - bli_saxpyv_zen_int10 - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } -#else saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) @@ -141,7 +119,7 @@ void bli_saxpyf_zen_int_6 cntx ); } -#endif + return; } diff --git a/kernels/zen/3/bli_gemm_small.c b/kernels/zen/3/bli_gemm_small.c index bf6c9c29c..3e9463fab 100644 --- a/kernels/zen/3/bli_gemm_small.c +++ b/kernels/zen/3/bli_gemm_small.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2017-2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2017-2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -114,16 +114,9 @@ err_t bli_gemm_small AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); return BLIS_NOT_YET_IMPLEMENTED; #else - // When dynamic dispatch is enabled i.e. library is built for 'amdzen' configuration. - // Invoke architecture specific kernels only if we are sure that we are running on zen, - // zen2 or zen3 otherwise fall back to reference kernels (via framework and context). - arch_t id = bli_arch_query_id(); - bool bamdzen = (id == BLIS_ARCH_ZEN4) || - (id == BLIS_ARCH_ZEN3) || - (id == BLIS_ARCH_ZEN2) || - (id == BLIS_ARCH_ZEN); - - if (0 == bamdzen) + // This function is invoked on all architectures including ‘generic’. + // Non-AVX platforms will use the kernels derived from the context. + if (bli_cpuid_is_avx_supported() == FALSE) { return BLIS_NOT_YET_IMPLEMENTED; }
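The bli_gemm_small.c change above completes the runtime-dispatch picture: instead of enumerating Zen architecture IDs, the entry point now uses a CPU-feature query, so the optimized small-GEMM path is taken exactly when the hardware can run it. Condensed from the hunk above:

    /* bli_gemm_small() is reachable from every configuration,
       including 'generic'; non-AVX platforms fall back to the
       kernels derived from the context. */
    if ( bli_cpuid_is_avx_supported() == FALSE )
    {
        return BLIS_NOT_YET_IMPLEMENTED;
    }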