Removed Arch specific code from BLIS framework.

- Removed BLIS_CONFIG_EPYC macro
- The code dependent on this macro is handled in
  one of the three ways

  -- It is updated to work across platforms.
  -- Added in architecture/feature specific runtime checks.
  -- Duplicated in AMD specific files. The build system is updated to
     pick AMD specific files when the library is built for any of the
     zen architectures.

AMD-Internal: [CPUPL-1960]
Change-Id: I6f9f8018e41fa48eb43ae4245c9c2c361857f43b
This commit is contained in:
Dipal M Zambare
2021-12-20 09:43:13 +05:30
parent 79c6aa5643
commit f63f78d783
53 changed files with 11226 additions and 8028 deletions

View File

@@ -5,7 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -212,6 +212,27 @@ MK_REFKERN_OBJS := $(foreach arch, $(CONFIG_LIST), \
# Generate object file paths for all of the portable framework source code.
MK_FRAME_OBJS := $(call gen-obj-paths-from-src,$(FRAME_SRC_SUFS),$(MK_FRAME_SRC),$(FRAME_PATH),$(BASE_OBJ_FRAME_PATH))
# AMD has optimized some of the framework files, these optimizations
# may not be compatible with other platforms.
#
# In order to keep main framework code independent of AMD changes,
# AMD has duplicated the files and updated them for example
# frame/compact/bla_gemm.c : generic framework file
# frame/compact/bla_gemm_amd.c : AMD optimized framework file
# Based on the architecture we choose the correct files
ifeq ($(MK_IS_ARCH_ZEN),yes)
# Building for an AMD (zen-family) platform: for every framework file
# that has an AMD-specific "_amd" counterpart, drop the generic object
# so that only the "_amd" object is compiled into the library.
MK_FRAME_AMD_OBJS := $(filter $(BASE_OBJ_FRAME_PATH)/%amd.o, $(MK_FRAME_OBJS))
# Map each "<name>_amd.o" back to its generic "<name>.o" twin; those
# twins are the objects to exclude from the build.
FILES_TO_REMOVE := $(subst _amd.o,.o, $(MK_FRAME_AMD_OBJS))
MK_FRAME_OBJS := $(filter-out $(FILES_TO_REMOVE), $(MK_FRAME_OBJS))
else
# Building for a non-AMD platform: drop all AMD-specific "_amd" objects
# and keep only the generic framework implementations.
MK_FRAME_AMD_OBJS := $(filter $(BASE_OBJ_FRAME_PATH)/%amd.o, $(MK_FRAME_OBJS))
MK_FRAME_OBJS := $(filter-out $(MK_FRAME_AMD_OBJS), $(MK_FRAME_OBJS))
endif
# Generate object file paths for all of the debug and trace logger sources.
MK_AOCLDTL_OBJS := $(call gen-obj-paths-from-src,$(AOCLDTL_SRC_SUFS),$(MK_AOCLDTL_SRC),$(AOCLDTL_PATH),$(BASE_OBJ_AOCLDTL_PATH))
@@ -1338,4 +1359,3 @@ else
@echo "Uninstalling $(@F) from $(@D)/"
@- $(RM_F) $@
endif

View File

@@ -5,7 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -204,5 +204,7 @@ MK_ENABLE_AOCL_DYNAMIC := @enable_aocl_dynamic@
# BLAS int size
MK_BLAS_INT_TYPE_SIZE := @blas_int_type_size@
MK_IS_ARCH_ZEN := @enable_aocl_zen@
# end of ifndef CONFIG_MK_INCLUDED conditional block
endif

View File

@@ -4,7 +4,7 @@
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -49,16 +49,6 @@ else
COPTFLAGS := -O3
endif
# This will add BLIS_CONFIG_EPYC for all framework files
# FIXME: framework files should not have architecture specific
# checks at least at compile time. Once the macro
# is defined it is applicable to every build in the
# Family including any non AMD configuration.
# However, it is still better to define it in makefiles
# instead of headers so we can have slightly more
# control on this.
COPTFLAGS += -DBLIS_CONFIG_EPYC
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -5,7 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -46,25 +46,12 @@ AMD_CONFIG_FILE := amd_config.mk
AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
# Since we removed BLIS_CONFIG_EPYC from header file, we need to
# add it here at two places,
# CPPROCFLAGS = This will enable it for framework code
# This flag is used when configure is invoked with specific architecture
# CKOPTFLAGS = This will enable it for architecture specific kernels
# This flag is used for kernels associated with this architecture
# irrespective of the configuration it is built for.
CPPROCFLAGS := -DBLIS_CONFIG_EPYC
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3
endif
#
# --- Enable ETRACE across the library if enabled ETRACE_ENABLE=[0,1] -----------------------
#
@@ -86,10 +73,6 @@ else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Add this after updating variables for reference kernels
# we don't want this defined for them
CKOPTFLAGS += -DBLIS_CONFIG_EPYC
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -5,7 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -50,15 +50,7 @@ THIS_CONFIG := zen2
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
# Since we removed BLIS_CONFIG_EPYC from header file, we need to
# add it here at two places,
# CPPROCFLAGS = This will enable it for framework code
# This flag is used when configure is invoked with specific architecture
# CKOPTFLAGS = This will enable it for architecture specific kernels
# This flag is used for kernels associated with this architecture
# irrespective of the configuration it is built for.
CPPROCFLAGS := -DBLIS_CONFIG_EPYC
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
@@ -111,10 +103,6 @@ endif
CROPTFLAGS := $(CKOPTFLAGS)
CRVECFLAGS := $(CKVECFLAGS)
# Add this after updating variables for reference kernels
# we don't want this defined for them
CKOPTFLAGS += -DBLIS_CONFIG_EPYC
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -5,7 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -50,15 +50,7 @@ THIS_CONFIG := zen3
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
# Since we removed BLIS_CONFIG_EPYC from header file, we need to
# add it here at two places,
# CPPROCFLAGS = This will enable it for framework code
# This flag is used when configure is invoked with specific architecture
# CKOPTFLAGS = This will enable it for architecture specific kernels
# This flag is used for kernels associated with this architecture
# irrespective of the configuration it is built for.
CPPROCFLAGS := -DBLIS_CONFIG_EPYC
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
@@ -132,10 +124,6 @@ endif # gcc
CROPTFLAGS := $(CKOPTFLAGS)
CRVECFLAGS := $(CKVECFLAGS)
# Add this after updating variables for reference kernels
# we don't want this defined for them
CKOPTFLAGS += -DBLIS_CONFIG_EPYC
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -4,7 +4,7 @@
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -49,15 +49,7 @@ THIS_CONFIG := zen4
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
# Since we removed BLIS_CONFIG_EPYC from header file, we need to
# add it here at two places,
# CPPROCFLAGS = This will enable it for framework code
# This flag is used when configure is invoked with specific architecture
# CKOPTFLAGS = This will enable it for architecture specific kernels
# This flag is used for kernels associated with this architecture
# irrespective of the configuration it is built for.
CPPROCFLAGS := -DBLIS_CONFIG_EPYC
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
@@ -131,10 +123,6 @@ endif # gcc
CROPTFLAGS := $(CKOPTFLAGS)
CRVECFLAGS := $(CKVECFLAGS)
# Add this after updating variables for reference kernels
# we don't want this defined for them
CKOPTFLAGS += -DBLIS_CONFIG_EPYC
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

3
configure vendored
View File

@@ -5,7 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -3370,6 +3370,7 @@ main()
| sed -e "s/@enable_aocl_dynamic@/${enable_aocl_dynamic}/g" \
| sed -e "s/@complex_return@/${complex_return}/g" \
| sed -e "s/@blas_int_type_size@/${blas_int_type_size}/g" \
| sed -e "s/\@enable_aocl_zen\@/${enable_aocl_zen}/g" \
> "${config_mk_out_path}"

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020 - 21, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -104,357 +104,5 @@ void PASTEMAC(ch,varname) \
} \
}
#ifdef BLIS_CONFIG_EPYC
void bli_dgemv_unf_var1
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
cntx_t* cntx
)
{
double *A1;
double *y1;
dim_t i;
dim_t f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
//memory pool declarations for packing vector X.
mem_t mem_bufX;
rntm_t rntm;
double *x_buf = x;
inc_t buf_incx = incx;
bli_init_once();
if (cntx == NULL)
cntx = bli_gks_query_cntx();
bli_set_dims_incs_with_trans(transa,
m, n, rs_a, cs_a,
&n_iter, &n_elem, &rs_at, &cs_at);
conja = bli_extract_conj(transa);
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2, zen3 or zen4; otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen == 0)
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
const num_t dt = PASTEMAC(d,type);
double* x1;
double* y1;
PASTECH(d,dotxf_ker_ft) kfp_df;
/* Query the context for the kernel function pointer and fusing factor. */
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (0 )*incy;
y1 = y + (i )*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
kfp_df
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x1, incx,
beta,
y1, incy,
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
if (incx > 1)
{
/*
Initialize mem pool buffer to NULL and size to 0
"buf" and "size" fields are assigned once memory
is allocated from the pool in bli_membrk_acquire_m().
This will ensure bli_mem_is_alloc() will be passed on
an allocated memory if created or a NULL .
*/
mem_bufX.pblk.buf = NULL;
mem_bufX.pblk.block_size = 0;
mem_bufX.buf_type = 0;
mem_bufX.size = 0;
mem_bufX.pool = NULL;
/* In order to get the buffer from pool via rntm access to memory broker
is needed. The following are initializations for rntm */
bli_rntm_init_from_global(&rntm);
bli_rntm_set_num_threads_only(1, &rntm);
bli_membrk_rntm_set_membrk(&rntm);
//calculate the size required for n_elem double elements in vector X.
size_t buffer_size = n_elem * sizeof(double);
#ifdef BLIS_ENABLE_MEM_TRACING
printf("bli_dgemv_unf_var1(): get mem pool block\n");
#endif
/*acquire a Buffer(n_elem*size(double)) from the memory broker
and save the associated mem_t entry to mem_bufX.*/
bli_membrk_acquire_m(&rntm,
buffer_size,
BLIS_BUFFER_FOR_B_PANEL,
&mem_bufX);
/*Continue packing X if buffer memory is allocated*/
if ((bli_mem_is_alloc(&mem_bufX)))
{
x_buf = bli_mem_buffer(&mem_bufX);
//pack X vector with non-unit stride to a temp buffer x_buf with unit stride
for (dim_t x_index = 0; x_index < n_elem; x_index++)
{
*(x_buf + x_index) = *(x + (x_index * incx));
}
// stride of vector x_buf =1
buf_incx = 1;
}
}
dim_t fuse_factor = 8;
dim_t f_temp =0;
if (n < 4)
{
fuse_factor = 2;
} else if (n < 8)
{
fuse_factor = 4;
}
for (i = 0; i < n_iter; i += f)
{
f = bli_determine_blocksize_dim_f(i, n_iter, fuse_factor);
//A = a + i * row_increment + 0 * column_increment
A1 = a + (i)*rs_at;
y1 = y + (i)*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
switch (f)
{
case 8:
bli_ddotxf_zen_int_8(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x_buf, buf_incx,
beta,
y1, incy,
cntx);
break;
default:
if (f < 4)
{
bli_ddotxf_zen_int_2(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x_buf, buf_incx,
beta,
y1, incy,
cntx);
}
else
{
bli_ddotxf_zen_int_4(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x_buf, buf_incx,
beta,
y1, incy,
cntx);
}
}
f_temp = bli_determine_blocksize_dim_f(i + f, n_iter, fuse_factor);
if (f_temp < fuse_factor)
{
switch (fuse_factor)
{
case 8:
fuse_factor = 4;
break;
case 4:
fuse_factor = 2;
break;
}
}
}
if ((incx > 1) && bli_mem_is_alloc(&mem_bufX))
{
#ifdef BLIS_ENABLE_MEM_TRACING
printf("bli_dgemv_unf_var1(): releasing mem pool block\n");
#endif
// Return the buffer to pool
bli_membrk_release(&rntm, &mem_bufX);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
void bli_sgemv_unf_var1
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
float* alpha,
float* a, inc_t rs_a, inc_t cs_a,
float* x, inc_t incx,
float* beta,
float* y, inc_t incy,
cntx_t* cntx
)
{
float* A1;
float* x1;
float* y1;
dim_t i;
dim_t b_fuse, f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
bli_init_once();
if( cntx == NULL ) cntx = bli_gks_query_cntx();
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_iter, &n_elem, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2, zen3 or zen4; otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen == 0)
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
const num_t dt = PASTEMAC(s,type);
float* x1 ;
PASTECH(s,dotxf_ker_ft) kfp_df;
/* Query the context for the kernel function pointer and fusing factor. */
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (0 )*incy;
y1 = y + (i )*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
kfp_df
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x1, incx,
beta,
y1, incy,
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
/* Query the context for the kernel function pointer and fusing factor. */
b_fuse = 8;
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (0 )*incy;
y1 = y + (i )*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
bli_sdotxf_zen_int_8
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x1, incx,
beta,
y1, incy,
cntx
);
}
}
INSERT_GENTFUNC_BASIC0_CZ( gemv_unf_var1 )
#else
INSERT_GENTFUNC_BASIC0( gemv_unf_var1 )
#endif

View File

@@ -0,0 +1,440 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
// Type-generic fallback implementation of gemv_unf_var1 (unfactored
// variant 1: row-oriented, built on the fused DOTXF kernel queried from
// the context). It computes y1 = beta * y1 + alpha * A1 * x one fused
// panel of rows at a time. This template is instantiated below only for
// the complex types (c,z); s and d have hand-written definitions.
// NOTE(review): the line "x1 = x + (0 )*incy;" uses incy where incx
// would be expected — harmless because the offset is 0, but it looks
// like a copy/paste slip; confirm upstream before relying on it.
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
trans_t transa, \
conj_t conjx, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* cntx \
) \
{ \
\
if(cntx == NULL) cntx = bli_gks_query_cntx(); \
\
const num_t dt = PASTEMAC(ch,type); \
\
ctype* A1; \
ctype* x1; \
ctype* y1; \
dim_t i; \
dim_t b_fuse, f; \
dim_t n_elem, n_iter; \
inc_t rs_at, cs_at; \
conj_t conja; \
\
bli_set_dims_incs_with_trans( transa, \
m, n, rs_a, cs_a, \
&n_iter, &n_elem, &rs_at, &cs_at ); \
\
conja = bli_extract_conj( transa ); \
\
PASTECH(ch,dotxf_ker_ft) kfp_df; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
\
for ( i = 0; i < n_iter; i += f ) \
{ \
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
\
A1 = a + (i )*rs_at + (0 )*cs_at; \
x1 = x + (0 )*incy; \
y1 = y + (i )*incy; \
\
/* y1 = beta * y1 + alpha * A1 * x; */ \
kfp_df \
( \
conja, \
conjx, \
n_elem, \
f, \
alpha, \
A1, cs_at, rs_at, \
x1, incx, \
beta, \
y1, incy, \
cntx \
); \
\
} \
}
// bli_dgemv_unf_var1: double-precision GEMV, unfactored variant 1
// (dot-product based). Computes y := beta * y + alpha * op(A) * x
// using the AMD-optimized ddotxf kernels (fuse widths 8/4/2).
// On machines without AVX support it falls back to the DOTXF kernel
// supplied by the context. When incx > 1, x is packed into a
// unit-stride scratch buffer obtained from the BLIS memory pool so the
// optimized kernels can assume unit stride.
void bli_dgemv_unf_var1
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
cntx_t* cntx
)
{
double *A1;
double *y1;
dim_t i;
dim_t f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
//memory pool declarations for packing vector X.
mem_t mem_bufX;
rntm_t rntm;
// x_buf/buf_incx alias x/incx unless X gets packed below.
double *x_buf = x;
inc_t buf_incx = incx;
bli_init_once();
if (cntx == NULL)
cntx = bli_gks_query_cntx();
// Resolve transa into effective iteration/element counts and strides.
bli_set_dims_incs_with_trans(transa,
m, n, rs_a, cs_a,
&n_iter, &n_elem, &rs_at, &cs_at);
conja = bli_extract_conj(transa);
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
const num_t dt = PASTEMAC(d,type);
double* x1;
double* y1;
PASTECH(d,dotxf_ker_ft) kfp_df;
/* Query the context for the kernel function pointer and fusing factor. */
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (i )*rs_at + (0 )*cs_at;
// NOTE(review): offset 0 scaled by incy rather than incx — harmless
// here, but the stride name looks like a copy/paste slip.
x1 = x + (0 )*incy;
y1 = y + (i )*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
kfp_df
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x1, incx,
beta,
y1, incy,
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
// Pack X into a unit-stride pool buffer when it is strided, so the
// zen ddotxf kernels below always see buf_incx == 1.
if (incx > 1)
{
/*
Initialize mem pool buffer to NULL and size to 0
"buf" and "size" fields are assigned once memory
is allocated from the pool in bli_membrk_acquire_m().
This will ensure bli_mem_is_alloc() will be passed on
an allocated memory if created or a NULL .
*/
mem_bufX.pblk.buf = NULL;
mem_bufX.pblk.block_size = 0;
mem_bufX.buf_type = 0;
mem_bufX.size = 0;
mem_bufX.pool = NULL;
/* In order to get the buffer from pool via rntm access to memory broker
is needed. The following are initializations for rntm */
bli_rntm_init_from_global(&rntm);
bli_rntm_set_num_threads_only(1, &rntm);
bli_membrk_rntm_set_membrk(&rntm);
//calculate the size required for n_elem double elements in vector X.
size_t buffer_size = n_elem * sizeof(double);
#ifdef BLIS_ENABLE_MEM_TRACING
printf("bli_dgemv_unf_var1(): get mem pool block\n");
#endif
/*acquire a Buffer(n_elem*size(double)) from the memory broker
and save the associated mem_t entry to mem_bufX.*/
bli_membrk_acquire_m(&rntm,
buffer_size,
BLIS_BUFFER_FOR_B_PANEL,
&mem_bufX);
/*Continue packing X if buffer memory is allocated*/
// If the pool allocation failed, x_buf/buf_incx silently keep
// aliasing the original strided x — the kernels still work.
if ((bli_mem_is_alloc(&mem_bufX)))
{
x_buf = bli_mem_buffer(&mem_bufX);
//pack X vector with non-unit stride to a temp buffer x_buf with unit stride
for (dim_t x_index = 0; x_index < n_elem; x_index++)
{
*(x_buf + x_index) = *(x + (x_index * incx));
}
// stride of vector x_buf =1
buf_incx = 1;
}
}
// Pick the widest fuse width (8) the problem allows; small problems
// start narrower so the dispatch below matches a real kernel width.
// NOTE(review): the thresholds test n while the loop runs over n_iter
// (they differ when transa indicates a transpose) — confirm intended.
dim_t fuse_factor = 8;
dim_t f_temp =0;
if (n < 4)
{
fuse_factor = 2;
} else if (n < 8)
{
fuse_factor = 4;
}
for (i = 0; i < n_iter; i += f)
{
f = bli_determine_blocksize_dim_f(i, n_iter, fuse_factor);
//A = a + i * row_increment + 0 * column_increment
A1 = a + (i)*rs_at;
y1 = y + (i)*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
// Dispatch to the ddotxf kernel matching the current fuse width.
switch (f)
{
case 8:
bli_ddotxf_zen_int_8(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x_buf, buf_incx,
beta,
y1, incy,
cntx);
break;
default:
if (f < 4)
{
bli_ddotxf_zen_int_2(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x_buf, buf_incx,
beta,
y1, incy,
cntx);
}
else
{
bli_ddotxf_zen_int_4(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x_buf, buf_incx,
beta,
y1, incy,
cntx);
}
}
// If the next block would be smaller than the current fuse width,
// step the fuse width down (8 -> 4 -> 2) so the tail iterations
// dispatch to a kernel of matching width.
f_temp = bli_determine_blocksize_dim_f(i + f, n_iter, fuse_factor);
if (f_temp < fuse_factor)
{
switch (fuse_factor)
{
case 8:
fuse_factor = 4;
break;
case 4:
fuse_factor = 2;
break;
}
}
}
// Release the packing buffer, if one was actually acquired above.
if ((incx > 1) && bli_mem_is_alloc(&mem_bufX))
{
#ifdef BLIS_ENABLE_MEM_TRACING
printf("bli_dgemv_unf_var1(): releasing mem pool block\n");
#endif
// Return the buffer to pool
bli_membrk_release(&rntm, &mem_bufX);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
// bli_sgemv_unf_var1: single-precision GEMV, unfactored variant 1
// (dot-product based). Computes y := beta * y + alpha * op(A) * x.
// On AVX-capable machines it calls the zen sdotxf kernel with a
// hard-coded fuse width of 8; otherwise it uses the DOTXF kernel and
// fuse width supplied by the context. Unlike the double-precision
// variant above, no packing of a strided x is performed here.
void bli_sgemv_unf_var1
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
float* alpha,
float* a, inc_t rs_a, inc_t cs_a,
float* x, inc_t incx,
float* beta,
float* y, inc_t incy,
cntx_t* cntx
)
{
float* A1;
float* x1;
float* y1;
dim_t i;
dim_t b_fuse, f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
bli_init_once();
if( cntx == NULL ) cntx = bli_gks_query_cntx();
// Resolve transa into effective iteration/element counts and strides.
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_iter, &n_elem, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
const num_t dt = PASTEMAC(s,type);
float* x1 ;
PASTECH(s,dotxf_ker_ft) kfp_df;
/* Query the context for the kernel function pointer and fusing factor. */
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (i )*rs_at + (0 )*cs_at;
// NOTE(review): offset 0 scaled by incy rather than incx — harmless
// here, but the stride name looks like a copy/paste slip.
x1 = x + (0 )*incy;
y1 = y + (i )*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
kfp_df
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x1, incx,
beta,
y1, incy,
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
/* Query the context for the kernel function pointer and fusing factor. */
// AVX path: fuse width fixed at 8 to match bli_sdotxf_zen_int_8.
b_fuse = 8;
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (0 )*incy;
y1 = y + (i )*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
bli_sdotxf_zen_int_8
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x1, incx,
beta,
y1, incy,
cntx
);
}
}
INSERT_GENTFUNC_BASIC0_CZ( gemv_unf_var1 )

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-21, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -137,764 +137,4 @@ void PASTEMAC(ch,varname) \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); \
}
#ifdef BLIS_CONFIG_EPYC
void bli_dgemv_unf_var2
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
cntx_t* cntx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
double* A1;
double* x1;
dim_t i;
dim_t f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
//memory pool declarations for packing vector Y.
mem_t mem_bufY;
rntm_t rntm;
double *y_buf = y;
inc_t buf_incy = incy;
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_elem, &n_iter, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2, zen3 or zen4; otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen == 0)
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
const num_t dt = PASTEMAC(d,type);
double* x1;
double* y1;
/* If beta is zero, use setv. Otherwise, scale by beta. */
if ( PASTEMAC(d,eq0)( *beta ) )
{
double* zero = PASTEMAC(d,0);
/* y = 0; */
PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n_elem,
zero,
y, incy,
cntx,
NULL
);
}
else
{
/* y = beta * y; */
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
cntx,
NULL
);
}
PASTECH(d,axpyf_ker_ft) kfp_af;
/* Query the context for the kernel function pointer and fusing factor. */
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
y1 = y + (0 )*incy;
/* y = y + alpha * A1 * x1; */
kfp_af
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y1, incy,
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
/* If beta is zero, use setv. Otherwise, scale by beta. */
/* y = beta * y; */
/* beta=0 case is handled by scalv internally */
bli_dscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
NULL
);
if( bli_deq0( *alpha ) )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
return;
}
if (incy > 1)
{
/*
Initialize mem pool buffer to NULL and size to 0
"buf" and "size" fields are assigned once memory
is allocated from the pool in bli_membrk_acquire_m().
This will ensure bli_mem_is_alloc() will be passed on
an allocated memory if created or a NULL .
*/
mem_bufY.pblk.buf = NULL; mem_bufY.pblk.block_size = 0;
mem_bufY.buf_type = 0; mem_bufY.size = 0;
mem_bufY.pool = NULL;
/* In order to get the buffer from pool via rntm access to memory broker
is needed. The following are initializations for rntm */
bli_rntm_init_from_global( &rntm );
bli_rntm_set_num_threads_only( 1, &rntm );
bli_membrk_rntm_set_membrk( &rntm );
//calculate the size required for n_elem double elements in vector Y.
size_t buffer_size = n_elem * sizeof(double);
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_dgemv_unf_var2(): get mem pool block\n" );
#endif
/*acquire a Buffer(n_elem*size(double)) from the memory broker
and save the associated mem_t entry to mem_bufY.*/
bli_membrk_acquire_m(&rntm,
buffer_size,
BLIS_BUFFER_FOR_B_PANEL,
&mem_bufY);
/*Continue packing Y if buffer memory is allocated*/
if ((bli_mem_is_alloc( &mem_bufY )))
{
y_buf = bli_mem_buffer(&mem_bufY);
//pack Y vector with non-unit stride to a temp buffer y_buf with unit stride
for(dim_t y_index = 0 ; y_index < n_elem ; y_index++)
{
*(y_buf + y_index) = *(y + (y_index * incy)) ;
}
// stride of vector y_buf =1
buf_incy = 1;
}
}
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, BLIS_DGEMV_VAR2_FUSE );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
/* y = y + alpha * A1 * x1; */
bli_daxpyf_zen_int_16x4
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y_buf, buf_incy,
NULL
);
}
if ((incy > 1) && bli_mem_is_alloc( &mem_bufY ))
{
//store the result from unit strided y_buf to non-unit strided Y
for(dim_t y_index = 0 ; y_index < n_elem ; y_index++)
{
*(y + (y_index * incy)) = *(y_buf + y_index) ;
}
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_dgemv_unf_var2(): releasing mem pool block\n" );
#endif
// Return the buffer to pool
bli_membrk_release(&rntm , &mem_bufY);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
void bli_sgemv_unf_var2
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
float* alpha,
float* a, inc_t rs_a, inc_t cs_a,
float* x, inc_t incx,
float* beta,
float* y, inc_t incy,
cntx_t* cntx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
float* A1;
float* x1;
float* y1;
dim_t i;
dim_t b_fuse, f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_elem, &n_iter, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2, zen3 or zen4; otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen == 0)
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
const num_t dt = PASTEMAC(s,type);
/* If beta is zero, use setv. Otherwise, scale by beta. */
if ( PASTEMAC(s,eq0)( *beta ) )
{
float* zero = PASTEMAC(s,0);
/* y = 0; */
PASTEMAC2(s,setv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n_elem,
zero,
y, incy,
cntx,
NULL
);
}
else
{
/* y = beta * y; */
PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
cntx,
NULL
);
}
PASTECH(s,axpyf_ker_ft) kfp_af;
/* Query the context for the kernel function pointer and fusing factor. */
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
y1 = y + (0 )*incy;
/* y = y + alpha * A1 * x1; */
kfp_af
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y1, incy,
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
/* If beta is zero, use setv. Otherwise, scale by beta. */
/* y = beta * y; */
/* beta=0 case is hadled by scalv internally */
bli_sscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
NULL
);
if( bli_seq0( *alpha ) )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
return;
}
/* Query the context for the kernel function pointer and fusing factor. */
b_fuse = 6;
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
y1 = y + (0 )*incy;
/* y = y + alpha * A1 * x1; */
bli_saxpyf_zen_int_6
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y1, incy,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
// Double-complex GEMV, variant 2: y := beta * y + alpha * op(A) * x.
// Uses a fused 4x4 zen kernel when all strides are unit and no
// conjugation/transposition is requested; otherwise falls back to fused
// axpyf updates.  Non-zen architectures use context-derived kernels.
void bli_zgemv_unf_var2
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
dcomplex* alpha,
dcomplex* a, inc_t rs_a, inc_t cs_a,
dcomplex* x, inc_t incx,
dcomplex* beta,
dcomplex* y, inc_t incy,
cntx_t* cntx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
dcomplex* A1;
dcomplex* x1;
dcomplex* y1;
dim_t i;
dim_t b_fuse, f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
// Derive the dimensions and strides of op(A) from transa.
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_elem, &n_iter, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
/* If beta is zero, use setv. Otherwise, scale by beta. */
/* y = beta * y; */
/* beta=0 case is handled by scalv internally */
/* bli_zscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y,
incy,
cntx
);*/
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2, zen3 or zen4, otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen == 0)
{
// Generic path: use the kernels registered in the context.
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
const num_t dt = PASTEMAC(z,type);
/* If beta is zero, use setv. Otherwise, scale by beta. */
if ( PASTEMAC(z,eq0)( *beta ) )
{
dcomplex* zero = PASTEMAC(z,0);
/* y = 0; */
PASTEMAC2(z,setv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n_elem,
zero,
y, incy,
cntx,
NULL
);
}
else
{
/* y = beta * y; */
PASTEMAC2(z,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
cntx,
NULL
);
}
PASTECH(z,axpyf_ker_ft) kfp_af;
/* Query the context for the kernel function pointer and fusing factor. */
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
y1 = y + (0 )*incy;
/* y = y + alpha * A1 * x1; */
kfp_af
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y1, incy,
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
// Zen path: scale (or zero) y first; beta == 0 is handled inside scalv.
bli_zscalv_ex
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
cntx,
NULL
);
// alpha == 0 leaves y = beta * y; nothing further to do.
if( bli_zeq0( *alpha ) )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
// for non-unit incx, incy and rs_at and conjugate will be added in the next patch
if( (incx == 1 && incy == 1 && rs_at == 1 ) &&
!bli_is_conj(conja) && !bli_is_conj(conjx) && !bli_is_trans(transa))
{
// This gemv code deals with the following conditions only
// 1. incx, incy, and row stride equal to one
// 2. Non conjugate A matrix and X vector
// 3. No Transpose for A Matrix
// Rest is taken care by the else part (axpyf implementation)
bli_zgemv_zen_int_4x4
(
conja,
conjx,
m,
n,
alpha,
a, rs_at, cs_at,
x, incx,
beta,
y, incy,
NULL
);
}
else
{
/* fusing factor */
b_fuse = 4;
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
y1 = y + (0 )*incy;
/* y = y + alpha * A1 * x1; */
bli_zaxpyf_zen_int_4
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y1, incy,
NULL
);
}
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
/*
  bli_cgemv_unf_var2(): single-precision complex GEMV, variant 2.

  Computes y := beta * y + alpha * op(A) * x.  A fused 4x4 zen kernel
  handles the all-unit-stride, no-conjugation, no-transpose case;
  everything else goes through fused axpyf updates.  Non-zen
  architectures use context-derived reference kernels.

  Fix: the trace-exit macro invocation in the alpha == 0 early-return
  path was missing its terminating semicolon; every other call site in
  this file terminates it, so it is added here for consistency.  The
  commented-out legacy cscalv call was also removed as dead code.
*/
void bli_cgemv_unf_var2
     (
       trans_t transa,
       conj_t  conjx,
       dim_t   m,
       dim_t   n,
       scomplex*  alpha,
       scomplex*  a, inc_t rs_a, inc_t cs_a,
       scomplex*  x, inc_t incx,
       scomplex*  beta,
       scomplex*  y, inc_t incy,
       cntx_t* cntx
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
    scomplex*  A1;
    scomplex*  x1;
    scomplex*  y1;
    dim_t   i;
    dim_t   b_fuse, f;
    dim_t   n_elem, n_iter;
    inc_t   rs_at, cs_at;
    conj_t  conja;

    /* Derive the dimensions and strides of op(A) from transa. */
    bli_set_dims_incs_with_trans( transa,
                                  m, n, rs_a, cs_a,
                                  &n_elem, &n_iter, &rs_at, &cs_at );

    conja = bli_extract_conj( transa );

    // When dynamic dispatch is enabled, i.e. the library is built for the
    // amdzen configuration, this function is invoked on all architectures
    // including generic.  Invoke architecture-specific kernels only if we
    // are sure that we are running on zen, zen2, zen3 or zen4; otherwise
    // fall back to reference kernels (via framework and context).
    arch_t id = bli_arch_query_id();
    bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
                   (id == BLIS_ARCH_ZEN3) ||
                   (id == BLIS_ARCH_ZEN2) ||
                   (id == BLIS_ARCH_ZEN);

    if (bamdzen == 0)
    {
        /* Generic path: use the kernels registered in the context. */
        if ( cntx == NULL ) cntx = bli_gks_query_cntx();

        const num_t dt = PASTEMAC(c,type);

        /* If beta is zero, use setv. Otherwise, scale by beta. */
        if ( PASTEMAC(c,eq0)( *beta ) )
        {
            scomplex* zero = PASTEMAC(c,0);
            /* y = 0; */
            PASTEMAC2(c,setv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              zero,
              y, incy,
              cntx,
              NULL
            );
        }
        else
        {
            /* y = beta * y; */
            PASTEMAC2(c,scalv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              beta,
              y, incy,
              cntx,
              NULL
            );
        }

        PASTECH(c,axpyf_ker_ft) kfp_af;

        /* Query the context for the kernel function pointer and fusing factor. */
        kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
        b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );

        for ( i = 0; i < n_iter; i += f )
        {
            f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );

            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            kfp_af
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              cntx
            );
        }

        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    /* Zen path: scale (or zero) y; beta == 0 is handled inside scalv. */
    bli_cscalv_ex
    (
      BLIS_NO_CONJUGATE,
      n_elem,
      beta,
      y, incy,
      cntx,
      NULL
    );

    /* alpha == 0 leaves y = beta * y; nothing further to do. */
    if( bli_ceq0( *alpha ) )
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    // for non-unit incx, incy and rs_at and conjugate will be added in the next patch
    if( ( (incx == 1) && (incy == 1) && (rs_at == 1) ) &&
        !bli_is_conj(conja) && !bli_is_conj(conjx) &&
        !bli_is_trans(transa))
    {
        // This gemv code deals with the following conditions only
        // 1. incx, incy, and row stride equal to one
        // 2. Non conjugate A matrix and X vector
        // 3. No Transpose for A Matrix
        // Rest is taken care by the else part (axpyf implementation)
        bli_cgemv_zen_int_4x4
        (
          conja,
          conjx,
          m,
          n,
          alpha,
          a, rs_at, cs_at,
          x, incx,
          beta,
          y, incy,
          NULL
        );
    }
    else
    {
        /* fusing factor. */
        b_fuse = 4;

        for ( i = 0; i < n_iter; i += f )
        {
            f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            bli_caxpyf_zen_int_4
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              NULL
            );
        }
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
#else
INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
#endif
INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )

View File

@@ -0,0 +1,879 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define BLIS_DGEMV_VAR2_FUSE 4
// Generic (reference) implementation of gemv variant 2, instantiated per
// datatype via the BLIS type-macro system.  y := beta*y + alpha*op(A)*x
// is computed as a sequence of fused axpyf column-block updates, with the
// axpyf kernel and its fusing factor queried from the context.
// (Comments are kept outside the macro body: every macro line must end
// with a '\' continuation, so the body below is left byte-identical.)
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
trans_t transa, \
conj_t conjx, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* cntx \
) \
{ \
\
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); \
\
bli_init_once(); \
\
if(cntx == NULL) cntx = bli_gks_query_cntx(); \
\
const num_t dt = PASTEMAC(ch,type); \
\
ctype* zero = PASTEMAC(ch,0); \
ctype* A1; \
ctype* x1; \
ctype* y1; \
dim_t i; \
dim_t b_fuse, f; \
dim_t n_elem, n_iter; \
inc_t rs_at, cs_at; \
conj_t conja; \
\
bli_set_dims_incs_with_trans( transa, \
m, n, rs_a, cs_a, \
&n_elem, &n_iter, &rs_at, &cs_at ); \
\
conja = bli_extract_conj( transa ); \
\
/* If beta is zero, use setv. Otherwise, scale by beta. */ \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
/* y = 0; */ \
PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
n_elem, \
zero, \
y, incy, \
cntx, \
NULL \
); \
} \
else \
{ \
/* y = beta * y; */ \
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
n_elem, \
beta, \
y, incy, \
cntx, \
NULL \
); \
} \
\
PASTECH(ch,axpyf_ker_ft) kfp_af; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
\
for ( i = 0; i < n_iter; i += f ) \
{ \
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
\
A1 = a + (0 )*rs_at + (i )*cs_at; \
x1 = x + (i )*incx; \
y1 = y + (0 )*incy; \
\
/* y = y + alpha * A1 * x1; */ \
kfp_af \
( \
conja, \
conjx, \
n_elem, \
f, \
alpha, \
A1, rs_at, cs_at, \
x1, incx, \
y1, incy, \
cntx \
); \
} \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); \
}
/*
  bli_dgemv_unf_var2(): double-precision real GEMV, variant 2
  (AMD-optimized file).

  Computes y := beta * y + alpha * op(A) * x.  On AVX-capable parts the
  zen scalv/axpyf kernels are used directly; when incy > 1, y is packed
  into a unit-stride scratch buffer from the memory broker so the fused
  axpyf kernel runs at unit stride, and the result is copied back.
  Non-AVX platforms use the kernels registered in the context.

  Fix: the trace-exit macro invocation in the alpha == 0 early-return
  path was missing its terminating semicolon; every other call site in
  this file terminates it, so it is added here for consistency.
*/
void bli_dgemv_unf_var2
     (
       trans_t transa,
       conj_t  conjx,
       dim_t   m,
       dim_t   n,
       double*  alpha,
       double*  a, inc_t rs_a, inc_t cs_a,
       double*  x, inc_t incx,
       double*  beta,
       double*  y, inc_t incy,
       cntx_t* cntx
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
    double*  A1;
    double*  x1;
    dim_t   i;
    dim_t   f;
    dim_t   n_elem, n_iter;
    inc_t   rs_at, cs_at;
    conj_t  conja;

    // Memory pool declarations for packing vector Y.
    mem_t  mem_bufY;
    rntm_t rntm;
    double* y_buf   = y;     // defaults to the caller's y (unit-stride case)
    inc_t  buf_incy = incy;

    // For AMD these APIs are invoked skipping intermediate framework
    // layers, hence we need to ensure that cntx is set here.
    bli_init_once();
    if(cntx == NULL) cntx = bli_gks_query_cntx();

    /* Derive the dimensions and strides of op(A) from transa. */
    bli_set_dims_incs_with_trans( transa,
                                  m, n, rs_a, cs_a,
                                  &n_elem, &n_iter, &rs_at, &cs_at );

    conja = bli_extract_conj( transa );

    // This function is invoked on all architectures including generic.
    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == FALSE)
    {
        const num_t dt = PASTEMAC(d,type);
        double* x1;
        double* y1;
        /* If beta is zero, use setv. Otherwise, scale by beta. */
        if ( PASTEMAC(d,eq0)( *beta ) )
        {
            double* zero = PASTEMAC(d,0);
            /* y = 0; */
            PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              zero,
              y, incy,
              cntx,
              NULL
            );
        }
        else
        {
            /* y = beta * y; */
            PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              beta,
              y, incy,
              cntx,
              NULL
            );
        }

        PASTECH(d,axpyf_ker_ft) kfp_af;

        /* Query the context for the kernel function pointer and fusing factor. */
        kfp_af       = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
        dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );

        for ( i = 0; i < n_iter; i += f )
        {
            f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );

            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            kfp_af
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              cntx
            );
        }

        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    /* If beta is zero, use setv. Otherwise, scale by beta. */
    /* y = beta * y; */
    /* beta=0 case is handled by scalv internally */
    bli_dscalv_zen_int10
    (
      BLIS_NO_CONJUGATE,
      n_elem,
      beta,
      y, incy,
      cntx
    );

    /* alpha == 0 leaves y = beta * y; nothing further to do. */
    if( bli_deq0( *alpha ) )
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    if (incy > 1)
    {
        /*
          Initialize mem pool buffer to NULL and size to 0
          "buf" and "size" fields are assigned once memory
          is allocated from the pool in bli_membrk_acquire_m().
          This will ensure bli_mem_is_alloc() will be passed on
          an allocated memory if created or a NULL .
        */
        mem_bufY.pblk.buf = NULL;   mem_bufY.pblk.block_size = 0;
        mem_bufY.buf_type = 0;      mem_bufY.size = 0;
        mem_bufY.pool = NULL;

        /* In order to get the buffer from pool via rntm access to memory
           broker is needed. Following are initializations for rntm. */
        bli_rntm_init_from_global( &rntm );
        bli_rntm_set_num_threads_only( 1, &rntm );
        bli_membrk_rntm_set_membrk( &rntm );

        // Calculate the size required for n_elem double elements in vector Y.
        size_t buffer_size = n_elem * sizeof(double);

#ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_dgemv_unf_var2(): get mem pool block\n" );
#endif

        /* Acquire a buffer (n_elem * size(double)) from the memory broker
           and save the associated mem_t entry to mem_bufY. */
        bli_membrk_acquire_m(&rntm,
                             buffer_size,
                             BLIS_BUFFER_FOR_B_PANEL,
                             &mem_bufY);

        /* Continue packing Y only if buffer memory was allocated; on
           failure we fall through with y_buf == y and buf_incy == incy. */
        if ((bli_mem_is_alloc( &mem_bufY )))
        {
            y_buf = bli_mem_buffer(&mem_bufY);

            // Pack Y vector with non-unit stride to a temp buffer y_buf
            // with unit stride.
            for(dim_t y_index = 0 ; y_index < n_elem ; y_index++)
            {
                *(y_buf + y_index) = *(y + (y_index * incy)) ;
            }
            // Stride of vector y_buf = 1.
            buf_incy = 1;
        }
    }

    for ( i = 0; i < n_iter; i += f )
    {
        f  = bli_determine_blocksize_dim_f( i, n_iter, BLIS_DGEMV_VAR2_FUSE );

        A1 = a + (0 )*rs_at + (i )*cs_at;
        x1 = x + (i )*incx;

        /* y = y + alpha * A1 * x1; */
        bli_daxpyf_zen_int_16x4
        (
          conja,
          conjx,
          n_elem,
          f,
          alpha,
          A1, rs_at, cs_at,
          x1, incx,
          y_buf, buf_incy,
          cntx
        );
    }

    if ((incy > 1) && bli_mem_is_alloc( &mem_bufY ))
    {
        // Store the result from unit-strided y_buf to non-unit-strided Y.
        for(dim_t y_index = 0 ; y_index < n_elem ; y_index++)
        {
            *(y + (y_index * incy)) = *(y_buf + y_index) ;
        }

#ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_dgemv_unf_var2(): releasing mem pool block\n" );
#endif

        // Return the buffer to pool.
        bli_membrk_release(&rntm , &mem_bufY);
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
/*
  bli_sgemv_unf_var2(): single-precision real GEMV, variant 2
  (AMD-optimized file).

  Computes y := beta * y + alpha * op(A) * x via fused saxpyf updates on
  AVX-capable parts; non-AVX platforms use kernels registered in the
  context.

  Fix: the trace-exit macro invocation in the alpha == 0 early-return
  path was missing its terminating semicolon; every other call site in
  this file terminates it, so it is added here for consistency.
*/
void bli_sgemv_unf_var2
     (
       trans_t transa,
       conj_t  conjx,
       dim_t   m,
       dim_t   n,
       float*  alpha,
       float*  a, inc_t rs_a, inc_t cs_a,
       float*  x, inc_t incx,
       float*  beta,
       float*  y, inc_t incy,
       cntx_t* cntx
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
    float*  A1;
    float*  x1;
    float*  y1;
    dim_t   i;
    dim_t   b_fuse, f;
    dim_t   n_elem, n_iter;
    inc_t   rs_at, cs_at;
    conj_t  conja;

    // For AMD these APIs are invoked skipping intermediate framework
    // layers, hence we need to ensure that cntx is set here.
    bli_init_once();
    if(cntx == NULL) cntx = bli_gks_query_cntx();

    /* Derive the dimensions and strides of op(A) from transa. */
    bli_set_dims_incs_with_trans( transa,
                                  m, n, rs_a, cs_a,
                                  &n_elem, &n_iter, &rs_at, &cs_at );

    conja = bli_extract_conj( transa );

    // This function is invoked on all architectures including generic.
    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == FALSE)
    {
        const num_t dt = PASTEMAC(s,type);

        /* If beta is zero, use setv. Otherwise, scale by beta. */
        if ( PASTEMAC(s,eq0)( *beta ) )
        {
            float* zero = PASTEMAC(s,0);
            /* y = 0; */
            PASTEMAC2(s,setv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              zero,
              y, incy,
              cntx,
              NULL
            );
        }
        else
        {
            /* y = beta * y; */
            PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              beta,
              y, incy,
              cntx,
              NULL
            );
        }

        PASTECH(s,axpyf_ker_ft) kfp_af;

        /* Query the context for the kernel function pointer and fusing factor. */
        kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
        b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );

        for ( i = 0; i < n_iter; i += f )
        {
            f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );

            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            kfp_af
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              cntx
            );
        }

        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    /* If beta is zero, use setv. Otherwise, scale by beta. */
    /* y = beta * y; */
    /* beta=0 case is handled by scalv internally */
    bli_sscalv_zen_int10
    (
      BLIS_NO_CONJUGATE,
      n_elem,
      beta,
      y, incy,
      cntx
    );

    /* alpha == 0 leaves y = beta * y; nothing further to do. */
    if( bli_seq0( *alpha ) )
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    /* Fusing factor matching the zen saxpyf kernel below. */
    b_fuse = 6;

    for ( i = 0; i < n_iter; i += f )
    {
        f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
        A1 = a + (0 )*rs_at + (i )*cs_at;
        x1 = x + (i )*incx;
        y1 = y + (0 )*incy;

        /* y = y + alpha * A1 * x1; */
        bli_saxpyf_zen_int_6
        (
          conja,
          conjx,
          n_elem,
          f,
          alpha,
          A1, rs_at, cs_at,
          x1, incx,
          y1, incy,
          cntx
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
// Double-complex GEMV, variant 2 (AMD-optimized file):
// y := beta * y + alpha * op(A) * x.  A fused 4x4 zen kernel handles the
// all-unit-stride, no-conjugation, no-transpose case; otherwise fused
// zaxpyf updates are used.  Non-AVX platforms use context-derived kernels.
void bli_zgemv_unf_var2
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
dcomplex* alpha,
dcomplex* a, inc_t rs_a, inc_t cs_a,
dcomplex* x, inc_t incx,
dcomplex* beta,
dcomplex* y, inc_t incy,
cntx_t* cntx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
dcomplex* A1;
dcomplex* x1;
dcomplex* y1;
dim_t i;
dim_t b_fuse, f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
// For AMD these APIs are invoked skipping intermediate framework layers,
// hence we need to ensure that cntx is set here.
bli_init_once();
if(cntx == NULL) cntx = bli_gks_query_cntx();
// Derive the dimensions and strides of op(A) from transa.
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_elem, &n_iter, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
/* If beta is zero, use setv. Otherwise, scale by beta. */
/* y = beta * y; */
/* beta=0 case is handled by scalv internally */
/* bli_zscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y,
incy,
cntx
);*/
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
const num_t dt = PASTEMAC(z,type);
/* If beta is zero, use setv. Otherwise, scale by beta. */
if ( PASTEMAC(z,eq0)( *beta ) )
{
dcomplex* zero = PASTEMAC(z,0);
/* y = 0; */
PASTEMAC2(z,setv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n_elem,
zero,
y, incy,
cntx,
NULL
);
}
else
{
/* y = beta * y; */
PASTEMAC2(z,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
cntx,
NULL
);
}
PASTECH(z,axpyf_ker_ft) kfp_af;
/* Query the context for the kernel function pointer and fusing factor. */
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
y1 = y + (0 )*incy;
/* y = y + alpha * A1 * x1; */
kfp_af
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y1, incy,
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
// AVX path: scale (or zero) y first; beta == 0 is handled inside scalv.
bli_zscalv_ex
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
cntx,
NULL
);
// alpha == 0 leaves y = beta * y; nothing further to do.
if( bli_zeq0( *alpha ) )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
// for non-unit incx, incy and rs_at and conjugate will be added in the next patch
if( (incx == 1 && incy == 1 && rs_at == 1 ) &&
!bli_is_conj(conja) && !bli_is_conj(conjx) && !bli_is_trans(transa))
{
// This gemv code deals with the following conditions only
// 1. incx, incy, and row stride equal to one
// 2. Non conjugate A matrix and X vector
// 3. No Transpose for A Matrix
// Rest is taken care by the else part (axpyf implementation)
bli_zgemv_zen_int_4x4
(
conja,
conjx,
m,
n,
alpha,
a, rs_at, cs_at,
x, incx,
beta,
y, incy,
cntx
);
}
else
{
/* fusing factor */
b_fuse = 4;
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
y1 = y + (0 )*incy;
/* y = y + alpha * A1 * x1; */
bli_zaxpyf_zen_int_4
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y1, incy,
cntx
);
}
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
/*
  bli_cgemv_unf_var2(): single-precision complex GEMV, variant 2
  (AMD-optimized file).

  Computes y := beta * y + alpha * op(A) * x.  A fused 4x4 zen kernel
  handles the all-unit-stride, no-conjugation, no-transpose case;
  everything else goes through fused caxpyf updates.  Non-AVX platforms
  use kernels registered in the context.

  Fix: the trace-exit macro invocation in the alpha == 0 early-return
  path was missing its terminating semicolon; every other call site in
  this file terminates it, so it is added here for consistency.  The
  commented-out legacy cscalv call was also removed as dead code.
*/
void bli_cgemv_unf_var2
     (
       trans_t transa,
       conj_t  conjx,
       dim_t   m,
       dim_t   n,
       scomplex*  alpha,
       scomplex*  a, inc_t rs_a, inc_t cs_a,
       scomplex*  x, inc_t incx,
       scomplex*  beta,
       scomplex*  y, inc_t incy,
       cntx_t* cntx
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
    scomplex*  A1;
    scomplex*  x1;
    scomplex*  y1;
    dim_t   i;
    dim_t   b_fuse, f;
    dim_t   n_elem, n_iter;
    inc_t   rs_at, cs_at;
    conj_t  conja;

    // For AMD these APIs are invoked skipping intermediate framework
    // layers, hence we need to ensure that cntx is set here.
    bli_init_once();
    if(cntx == NULL) cntx = bli_gks_query_cntx();

    /* Derive the dimensions and strides of op(A) from transa. */
    bli_set_dims_incs_with_trans( transa,
                                  m, n, rs_a, cs_a,
                                  &n_elem, &n_iter, &rs_at, &cs_at );

    conja = bli_extract_conj( transa );

    // This function is invoked on all architectures including generic.
    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == FALSE)
    {
        const num_t dt = PASTEMAC(c,type);

        /* If beta is zero, use setv. Otherwise, scale by beta. */
        if ( PASTEMAC(c,eq0)( *beta ) )
        {
            scomplex* zero = PASTEMAC(c,0);
            /* y = 0; */
            PASTEMAC2(c,setv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              zero,
              y, incy,
              cntx,
              NULL
            );
        }
        else
        {
            /* y = beta * y; */
            PASTEMAC2(c,scalv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              beta,
              y, incy,
              cntx,
              NULL
            );
        }

        PASTECH(c,axpyf_ker_ft) kfp_af;

        /* Query the context for the kernel function pointer and fusing factor. */
        kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
        b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );

        for ( i = 0; i < n_iter; i += f )
        {
            f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );

            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            kfp_af
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              cntx
            );
        }

        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    /* AVX path: scale (or zero) y; beta == 0 is handled inside scalv. */
    bli_cscalv_ex
    (
      BLIS_NO_CONJUGATE,
      n_elem,
      beta,
      y, incy,
      cntx,
      NULL
    );

    /* alpha == 0 leaves y = beta * y; nothing further to do. */
    if( bli_ceq0( *alpha ) )
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    // for non-unit incx, incy and rs_at and conjugate will be added in the next patch
    if( ( (incx == 1) && (incy == 1) && (rs_at == 1) ) &&
        !bli_is_conj(conja) && !bli_is_conj(conjx) &&
        !bli_is_trans(transa))
    {
        // This gemv code deals with the following conditions only
        // 1. incx, incy, and row stride equal to one
        // 2. Non conjugate A matrix and X vector
        // 3. No Transpose for A Matrix
        // Rest is taken care by the else part (axpyf implementation)
        bli_cgemv_zen_int_4x4
        (
          conja,
          conjx,
          m,
          n,
          alpha,
          a, rs_at, cs_at,
          x, incx,
          beta,
          y, incy,
          cntx
        );
    }
    else
    {
        /* fusing factor. */
        b_fuse = 4;

        for ( i = 0; i < n_iter; i += f )
        {
            f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            bli_caxpyf_zen_int_4
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              cntx
            );
        }
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2021-22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -216,207 +216,5 @@ void PASTEMAC(ch,varname) \
} \
}
#ifdef BLIS_CONFIG_EPYC
// Forward declaration of the zen helper that applies the diagonal-block
// (A11) portion of a dhemv update for an 8x8 block with unit strides.
// NOTE(review): the stride parameters are declared in the order
// (cs_a, rs_a), while the visible call site in bli_dhemv_unf_var1 passes
// (rs_at, cs_at) — confirm the intended parameter/argument mapping
// against the kernel definition.
void bli_post_hemv_8x8
(
double *a,
double *x,
double *y,
double *alpha,
dim_t cs_a,
dim_t rs_a
);
// Double-precision real HEMV (symmetric in the real domain), unfused
// variant 1: y := beta * y + alpha * A * x for triangular-stored A.
// Processes A in fused column panels via a dotxaxpyf kernel, then
// finishes each diagonal block either with an 8x8 zen helper (unit-stride
// fast path) or with a scalar cleanup loop.
void bli_dhemv_unf_var1
(
uplo_t uplo,
conj_t conja,
conj_t conjx,
conj_t conjh,
dim_t m,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
cntx_t* cntx
)
{
const num_t dt = PASTEMAC(d,type);
double* one = PASTEMAC(d,1);
double* zero = PASTEMAC(d,0);
double* A10;
double* A11;
double* a10t;
double* alpha11;
double* a21;
double* x0;
double* x1;
double* chi11;
double* y0;
double* y1;
double* y01;
double* psi11;
double* y21;
double conjx_chi11;
double alpha_chi11;
double alpha11_temp;
dim_t i, k, j;
dim_t b_fuse, f;
dim_t n_behind;
dim_t f_ahead, f_behind;
inc_t rs_at, cs_at;
// NOTE(review): conj0/conj1 stay BLIS_NO_CONJUGATE (0) regardless of
// conja/conjh — a no-op for real-domain double, unlike the generic
// macro which derives them from conja; confirm if complex variants
// ever route here.
conj_t conj0 = 0, conj1 = 0;
/* The algorithm will be expressed in terms of the lower triangular
* case;the upper triangular case is supported by swapping the row
* and column strides of A. */
if ( bli_is_lower( uplo ) )
{
rs_at = rs_a;
cs_at = cs_a;
}
else /* if ( bli_is_upper( uplo ) ) */
{
rs_at = cs_a;
cs_at = rs_a;
}
/* If beta is zero, use setv. Otherwise, scale by beta. */
if ( PASTEMAC(d,eq0)( *beta ) )
{
/* y = 0; */
PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
zero,
y, incy,
cntx,
NULL
);
}
else
{
/* y = beta * y; */
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
beta,
y, incy,
cntx,
NULL
);
}
PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker;
/* Query the context for the kernel function pointer and fusing
* factor. */
/* Assign kernel function pointer and fusing factor. */
// On zen parts use the hand-written fused kernel directly; otherwise
// take the kernel and fusing factor from the context.
arch_t id = bli_arch_query_id();
bool bamdzen = ((id == BLIS_ARCH_ZEN4) ||(id == BLIS_ARCH_ZEN3)
|| (id == BLIS_ARCH_ZEN2) || (id == BLIS_ARCH_ZEN));
if (bamdzen)
{
kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8;
b_fuse = 8;
}
else
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
kfp_dotxaxpyf_ker =
bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx);
b_fuse =
bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx );
}
// Walk the diagonal in fused panels of (at most) b_fuse columns.
for ( i = 0; i < m; i += f )
{
f = bli_determine_blocksize_dim_f( i, m, b_fuse );
n_behind = i;
A10 = a + (i )*rs_at + (0 )*cs_at;
A11 = a + (i )*rs_at + (i )*cs_at;
x0 = x + (0 )*incx;
x1 = x + (i )*incx;
y0 = y + (0 )*incy;
y1 = y + (i )*incy;
/* y1 = y1 + alpha * A10 * x0; (dotxf) */
/* y0 = y0 + alpha * A10' * x1; (axpyf) */
kfp_dotxaxpyf_ker
(
conj0,
conj1,
conjx,
conjx,
n_behind,
f,
alpha,
A10, cs_at, rs_at,
x0, incx,
x1, incx,
one,
y1, incy,
y0, incy,
cntx
);
/* y1 = y1 + alpha * A11 * x1; (variant 4) */
if((f == 8) && (incx == 1) && (incy == 1) && (cs_at == 1))
{
/*this helper function handles unit stride only*/
// NOTE(review): the helper's prototype declares its stride
// parameters as (cs_a, rs_a) but (rs_at, cs_at) is passed here —
// verify the intended order against the kernel definition.
bli_post_hemv_8x8(A11, x1, y1, alpha, rs_at, cs_at);
}
else
{
// Scalar cleanup for the f x f diagonal block: for each column k,
// update the strictly-above part (y01), the diagonal (psi11), and
// the strictly-below part (y21).
for ( k = 0; k < f; ++k )
{
f_behind = k;
f_ahead = f - k - 1;
a10t = A11 + (k )*rs_at + (0 )*cs_at;
alpha11 = A11 + (k )*rs_at + (k )*cs_at;
a21 = A11 + (k+1)*rs_at + (k )*cs_at;
chi11 = x1 + (k )*incx;
y01 = y1 + (0 )*incy;
psi11 = y1 + (k )*incy;
y21 = y1 + (k+1)*incy;
/* y01 = y01 + alpha * a10t' * chi11; */
PASTEMAC(d,copycjs)( conjx, *chi11,
conjx_chi11 );
PASTEMAC(d,scal2s)( *alpha, conjx_chi11,
alpha_chi11 );
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,axpys)( alpha_chi11,
*(a10t + j*cs_at),
*(y01 + j*incy) );
PASTEMAC(d,copycjs)( conja, *alpha11,
alpha11_temp );
/* psi11 = psi11 + alpha * alpha11 * chi11; */
PASTEMAC(d,axpys)( alpha_chi11, alpha11_temp,
*psi11 );
/* y21 = y21 + alpha * a21 * chi11; */
for ( j = 0; j < f_ahead; ++j )
{
PASTEMAC(d,axpys)( alpha_chi11,
*(a21 + j*rs_at),
*(y21 + j*incy) );
}
}
}
}
}
GENTFUNC(float, s, hemv_unf_var1)
GENTFUNC(scomplex, c, hemv_unf_var1)
GENTFUNC(dcomplex, z, hemv_unf_var1)
#else
INSERT_GENTFUNC_BASIC0( hemv_unf_var1 )
#endif

View File

@@ -0,0 +1,418 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021-22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * Type-generic unfused variant 1 of hemv/symv.
 *
 * conjh carries the conjugation component of the Hermitian transpose:
 * hemv passes conjugation (which also triggers the seti0s() of the
 * diagonal element below), symv does not.  The algorithm is written for
 * the lower-triangular case; the upper-triangular case is handled by
 * swapping the row/column strides of A and toggling conj parameters.
 *
 * Instantiated for s, c and z at the bottom of this file; the 'd'
 * datatype is provided by the hand-written bli_dhemv_unf_var1 below.
 */
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conja, \
       conj_t  conjx, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    ctype*  one  = PASTEMAC(ch,1); \
    ctype*  zero = PASTEMAC(ch,0); \
    ctype*  A10; \
    ctype*  A11; \
    ctype*  a10t; \
    ctype*  alpha11; \
    ctype*  a21; \
    ctype*  x0; \
    ctype*  x1; \
    ctype*  chi11; \
    ctype*  y0; \
    ctype*  y1; \
    ctype*  y01; \
    ctype*  psi11; \
    ctype*  y21; \
    ctype   conjx_chi11; \
    ctype   alpha_chi11; \
    ctype   alpha11_temp; \
    dim_t   i, k, j; \
    dim_t   b_fuse, f; \
    dim_t   n_behind; \
    dim_t   f_ahead, f_behind; \
    inc_t   rs_at, cs_at; \
    conj_t  conj0, conj1; \
\
    /* The algorithm will be expressed in terms of the lower triangular case;
       the upper triangular case is supported by swapping the row and column
       strides of A and toggling some conj parameters. */ \
    if ( bli_is_lower( uplo ) ) \
    { \
        rs_at = rs_a; \
        cs_at = cs_a; \
\
        conj0 = conja; \
        conj1 = bli_apply_conj( conjh, conja ); \
    } \
    else /* if ( bli_is_upper( uplo ) ) */ \
    { \
        rs_at = cs_a; \
        cs_at = rs_a; \
\
        conj0 = bli_apply_conj( conjh, conja ); \
        conj1 = conja; \
    } \
\
    /* If beta is zero, use setv. Otherwise, scale by beta. */ \
    if ( PASTEMAC(ch,eq0)( *beta ) ) \
    { \
        /* y = 0; */ \
        PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
        ( \
          BLIS_NO_CONJUGATE, \
          m, \
          zero, \
          y, incy, \
          cntx, \
          NULL \
        ); \
    } \
    else \
    { \
        /* y = beta * y; */ \
        PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
        ( \
          BLIS_NO_CONJUGATE, \
          m, \
          beta, \
          y, incy, \
          cntx, \
          NULL \
        ); \
    } \
\
    PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
\
    /* Query the context for the kernel function pointer and fusing factor. */ \
    kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
    b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
\
    for ( i = 0; i < m; i += f ) \
    { \
        f        = bli_determine_blocksize_dim_f( i, m, b_fuse ); \
        n_behind = i; \
        A10      = a + (i  )*rs_at + (0  )*cs_at; \
        A11      = a + (i  )*rs_at + (i  )*cs_at; \
        x0       = x + (0  )*incx; \
        x1       = x + (i  )*incx; \
        y0       = y + (0  )*incy; \
        y1       = y + (i  )*incy; \
\
        /* y1 = y1 + alpha * A10  * x0;  (dotxf) */ \
        /* y0 = y0 + alpha * A10' * x1;  (axpyf) */ \
        kfp_xf \
        ( \
          conj0, \
          conj1, \
          conjx, \
          conjx, \
          n_behind, \
          f, \
          alpha, \
          A10, cs_at, rs_at, \
          x0,  incx, \
          x1,  incx, \
          one, \
          y1,  incy, \
          y0,  incy, \
          cntx \
        ); \
\
        /* y1 = y1 + alpha * A11 * x1;  (variant 4) */ \
        for ( k = 0; k < f; ++k ) \
        { \
            f_behind = k; \
            f_ahead  = f - k - 1; \
            a10t     = A11 + (k  )*rs_at + (0  )*cs_at; \
            alpha11  = A11 + (k  )*rs_at + (k  )*cs_at; \
            a21      = A11 + (k+1)*rs_at + (k  )*cs_at; \
            chi11    = x1  + (k  )*incx; \
            y01      = y1  + (0  )*incy; \
            psi11    = y1  + (k  )*incy; \
            y21      = y1  + (k+1)*incy; \
\
            /* y01 = y01 + alpha * a10t' * chi11; */ \
            PASTEMAC(ch,copycjs)( conjx, *chi11, conjx_chi11 ); \
            PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \
            if ( bli_is_conj( conj1 ) ) \
            { \
                for ( j = 0; j < f_behind; ++j ) \
                    PASTEMAC(ch,axpyjs)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
            } \
            else \
            { \
                for ( j = 0; j < f_behind; ++j ) \
                    PASTEMAC(ch,axpys)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
            } \
\
            /* For hemv, explicitly set the imaginary component of alpha11 to
               zero. */ \
            PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
            if ( bli_is_conj( conjh ) ) \
                PASTEMAC(ch,seti0s)( alpha11_temp ); \
\
            /* psi11 = psi11 + alpha * alpha11 * chi11; */ \
            PASTEMAC(ch,axpys)( alpha_chi11, alpha11_temp, *psi11 ); \
\
            /* y21 = y21 + alpha * a21 * chi11; */ \
            if ( bli_is_conj( conj0 ) ) \
            { \
                for ( j = 0; j < f_ahead; ++j ) \
                    PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
            } \
            else \
            { \
                for ( j = 0; j < f_ahead; ++j ) \
                    PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
            } \
        } \
    } \
}
/*
 * AMD (zen) helper: applies the f == 8 diagonal-block update of hemv
 * variant 1 for double-precision data.  The caller below guards the call
 * with (f == 8 && incx == 1 && incy == 1 && cs_at == 1) and passes its
 * rs_at as this function's cs_a (the leading dimension) and its unit
 * cs_at as rs_a — i.e. rs_a is expected to be 1 (unit stride).
 * NOTE(review): defined in an AMD-specific kernel file not visible here;
 * confirm the stride convention against that definition.
 */
void bli_post_hemv_8x8
     (
       double *a,
       double *x,
       double *y,
       double *alpha,
       dim_t   cs_a,
       dim_t   rs_a
     );
/*
 * Double-precision (real) specialization of unfused hemv/symv variant 1.
 *
 * Computes y := beta*y + alpha * A * x with A symmetric.  For real data
 * the conja/conjx/conjh conjugations are no-ops, so conj0/conj1 are fixed
 * to 0 below.  The sub-diagonal panel A10 is applied with a fused
 * dotxf+axpyf (dotxaxpyf) kernel; the f x f diagonal block A11 is applied
 * either by the AMD 8x8 helper (unit strides, f == 8) or by a scalar
 * fallback loop.  This function overrides the type-generic template for
 * 'd'; the s/c/z instantiations are generated by GENTFUNC below.
 */
void bli_dhemv_unf_var1
     (
       uplo_t  uplo,
       conj_t  conja,
       conj_t  conjx,
       conj_t  conjh,
       dim_t   m,
       double* alpha,
       double* a, inc_t rs_a, inc_t cs_a,
       double* x, inc_t incx,
       double* beta,
       double* y, inc_t incy,
       cntx_t* cntx
     )
{
    const num_t dt = PASTEMAC(d,type);
    double*  one  = PASTEMAC(d,1);
    double*  zero = PASTEMAC(d,0);
    double*  A10;
    double*  A11;
    double*  a10t;
    double*  alpha11;
    double*  a21;
    double*  x0;
    double*  x1;
    double*  chi11;
    double*  y0;
    double*  y1;
    double*  y01;
    double*  psi11;
    double*  y21;
    double   conjx_chi11;
    double   alpha_chi11;
    double   alpha11_temp;
    dim_t    i, k, j;
    dim_t    b_fuse, f;
    dim_t    n_behind;
    dim_t    f_ahead, f_behind;
    inc_t    rs_at, cs_at;
    /* Real data: conjugation is a no-op, so both kernel conj arguments
       stay at 0 (no conjugate) regardless of uplo. */
    conj_t   conj0 = 0, conj1 = 0;
    /* The algorithm will be expressed in terms of the lower triangular
     * case;the upper triangular case is supported by swapping the row
     * and column strides of A and toggling some conj parameters. */
    if ( bli_is_lower( uplo ) )
    {
        rs_at = rs_a;
        cs_at = cs_a;
    }
    else /* if ( bli_is_upper( uplo ) ) */
    {
        rs_at = cs_a;
        cs_at = rs_a;
    }
    /* If beta is zero, use setv. Otherwise, scale by beta. */
    if ( PASTEMAC(d,eq0)( *beta ) )
    {
        /* y = 0; */
        PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          m,
          zero,
          y, incy,
          cntx,
          NULL
        );
    }
    else
    {
        /* y = beta * y; */
        PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          m,
          beta,
          y, incy,
          cntx,
          NULL
        );
    }
    PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker;
    /* Query the context for the kernel function pointer and fusing
     * factor. */
    /* Assign kernel function pointer and fusing factor. */
    // This function is invoked on all architectures including generic.
    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == TRUE)
    {
        /* AVX path: hard-wire the zen fused kernel and its fusing
           factor of 8 (also the trigger for the 8x8 helper below). */
        kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8;
        b_fuse = 8;
    }
    else
    {
        /* cntx may legally arrive as NULL; fall back to the global
           kernel structure before querying. */
        if ( cntx == NULL ) cntx = bli_gks_query_cntx();
        kfp_dotxaxpyf_ker =
            bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx);
        b_fuse =
            bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx );
    }
    for ( i = 0; i < m; i += f )
    {
        f        = bli_determine_blocksize_dim_f( i, m, b_fuse );
        n_behind = i;
        A10      = a + (i  )*rs_at + (0  )*cs_at;
        A11      = a + (i  )*rs_at + (i  )*cs_at;
        x0       = x + (0  )*incx;
        x1       = x + (i  )*incx;
        y0       = y + (0  )*incy;
        y1       = y + (i  )*incy;
        /* y1 = y1 + alpha * A10  * x0;  (dotxf) */
        /* y0 = y0 + alpha * A10' * x1;  (axpyf) */
        kfp_dotxaxpyf_ker
        (
          conj0,
          conj1,
          conjx,
          conjx,
          n_behind,
          f,
          alpha,
          A10, cs_at, rs_at,
          x0,  incx,
          x1,  incx,
          one,
          y1,  incy,
          y0,  incy,
          cntx
        );
        /* y1 = y1 + alpha * A11 * x1;  (variant 4) */
        if((f == 8) && (incx == 1) && (incy == 1) && (cs_at == 1))
        {
            /*this helper function handles unit stride only*/
            /* rs_at is passed as the helper's leading dimension (cs_a)
               and the unit cs_at as its rs_a; see prototype above. */
            bli_post_hemv_8x8(A11, x1, y1, alpha, rs_at, cs_at);
        }
        else
        {
            /* Scalar fallback: rank-1-style update of the diagonal
               block, one column k at a time. */
            for ( k = 0; k < f; ++k )
            {
                f_behind = k;
                f_ahead  = f - k - 1;
                a10t     = A11 + (k  )*rs_at + (0  )*cs_at;
                alpha11  = A11 + (k  )*rs_at + (k  )*cs_at;
                a21      = A11 + (k+1)*rs_at + (k  )*cs_at;
                chi11    = x1  + (k  )*incx;
                y01      = y1  + (0  )*incy;
                psi11    = y1  + (k  )*incy;
                y21      = y1  + (k+1)*incy;
                /* y01 = y01 + alpha * a10t' * chi11; */
                PASTEMAC(d,copycjs)( conjx, *chi11,
                                     conjx_chi11 );
                PASTEMAC(d,scal2s)( *alpha, conjx_chi11,
                                    alpha_chi11 );
                for ( j = 0; j < f_behind; ++j )
                    PASTEMAC(d,axpys)( alpha_chi11,
                                       *(a10t + j*cs_at),
                                       *(y01 + j*incy) );
                PASTEMAC(d,copycjs)( conja, *alpha11,
                                     alpha11_temp );
                /* psi11 = psi11 + alpha * alpha11 * chi11; */
                PASTEMAC(d,axpys)( alpha_chi11, alpha11_temp,
                                   *psi11 );
                /* y21 = y21 + alpha * a21 * chi11; */
                for ( j = 0; j < f_ahead; ++j )
                {
                    PASTEMAC(d,axpys)( alpha_chi11,
                                       *(a21 + j*rs_at),
                                       *(y21 + j*incy) );
                }
            }
        }
    }
}
/* Instantiate the reference template for the remaining datatypes; 'd' is
   provided by the hand-written bli_dhemv_unf_var1 above. */
GENTFUNC(float, s, hemv_unf_var1)
GENTFUNC(scomplex, c, hemv_unf_var1)
GENTFUNC(dcomplex, z, hemv_unf_var1)

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -216,210 +216,6 @@ void PASTEMAC(ch,varname) \
} \
}
#ifdef BLIS_CONFIG_EPYC
void bli_pre_hemv_8x8
(
double *a,
double *x,
double *y,
double *alpha,
dim_t cs_a,
dim_t rs_a
);
void bli_dhemv_unf_var3
(
uplo_t uplo,
conj_t conja,
conj_t conjx,
conj_t conjh,
dim_t m,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
cntx_t* cntx
)
{
const num_t dt = PASTEMAC(d,type);
double* one = PASTEMAC(d,1);
double* zero = PASTEMAC(d,0);
double* A11;
double* A21;
double* a10t;
double* alpha11;
double* a21;
double* x1;
double* x2;
double* chi11;
double* y1;
double* y2;
double* y01;
double* psi11;
double* y21;
double conjx_chi11;
double alpha_chi11;
double alpha11_temp;
dim_t i, k, j;
dim_t b_fuse, f;
dim_t n_ahead;
dim_t f_ahead, f_behind;
inc_t rs_at, cs_at;
conj_t conj0 = 0, conj1 = 0;
/* The algorithm will be expressed in terms of the lower triangular
* case; the upper triangular case is supported by swapping the row
* and column strides of A and toggling some conj parameters. */
if ( bli_is_lower( uplo ) )
{
rs_at = rs_a;
cs_at = cs_a;
}
else /* if ( bli_is_upper( uplo ) ) */
{
rs_at = cs_a;
cs_at = rs_a;
}
/* If beta is zero, use setv. Otherwise, scale by beta. */
if ( PASTEMAC(d,eq0)( *beta ) )
{
/* y = 0; */
PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
zero,
y, incy,
cntx,
NULL
);
}
else
{
/* y = beta * y; */
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
beta,
y, incy,
cntx,
NULL
);
}
PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker;
arch_t id = bli_arch_query_id();
bool bamdzen = ((id == BLIS_ARCH_ZEN4) || (id == BLIS_ARCH_ZEN3)
|| (id == BLIS_ARCH_ZEN2) || (id == BLIS_ARCH_ZEN));
if (bamdzen)
{
kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8;
b_fuse = 8;
}
else
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
kfp_dotxaxpyf_ker =
bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx);
b_fuse =
bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx );
}
for ( i = 0; i < m; i += f )
{
f = bli_determine_blocksize_dim_f( i, m, b_fuse );
n_ahead = m - i - f;
A11 = a + (i )*rs_at + (i )*cs_at;
A21 = a + (i+f)*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
y1 = y + (i )*incy;
y2 = y + (i+f)*incy;
/* y1 = y1 + alpha * A11 * x1; (variant 4) */
if((f == 8) && (incx == 1) && (incy == 1) && (rs_at == 1))
{
/*this helper function handles unit stride only*/
bli_pre_hemv_8x8(A11, x1, y1, alpha, cs_at, rs_at);
}
else
{
for ( k = 0; k < f; ++k )
{
f_behind = k;
f_ahead = f - k - 1;
a10t = A11 + (k )*rs_at + (0 )*cs_at;
alpha11 = A11 + (k )*rs_at + (k )*cs_at;
a21 = A11 + (k+1)*rs_at + (k )*cs_at;
chi11 = x1 + (k )*incx;
y01 = y1 + (0 )*incy;
psi11 = y1 + (k )*incy;
y21 = y1 + (k+1)*incy;
/* y01 = y01 + alpha * a10t' * chi11; */
PASTEMAC(d,copycjs)( conjx,
*chi11, conjx_chi11 );
PASTEMAC(d,scal2s)( *alpha, conjx_chi11,
alpha_chi11 );
{
for ( j = 0; j < f_behind; ++j )
{
PASTEMAC(d,axpys)
( alpha_chi11,
*(a10t + j*cs_at),
*(y01 + j*incy) );
}
}
PASTEMAC(d,copycjs)( conja, *alpha11,
alpha11_temp );
/* psi11 = psi11 + alpha * alpha11 * chi11; */
PASTEMAC(d,axpys)( alpha_chi11, alpha11_temp,
*psi11 );
/* y21 = y21 + alpha * a21 * chi11; */
for ( j = 0; j < f_ahead; ++j )
{
PASTEMAC(d,axpys)( alpha_chi11,
*(a21 + j*rs_at),
*(y21 + j*incy) );
}
}
}
/* y1 = y1 + alpha * A21' * x2; (dotxf) */
/* y2 = y2 + alpha * A21 * x1; (axpyf) */
kfp_dotxaxpyf_ker
(
conj0,
conj1,
conjx,
conjx,
n_ahead,
f,
alpha,
A21, rs_at, cs_at,
x2, incx,
x1, incx,
one,
y1, incy,
y2, incy,
cntx
);
}
}
GENTFUNC(float, s, hemv_unf_var3)
GENTFUNC(scomplex, c, hemv_unf_var3)
GENTFUNC(dcomplex, z, hemv_unf_var3)
#else
INSERT_GENTFUNC_BASIC0( hemv_unf_var3 )
#endif

View File

@@ -0,0 +1,420 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * Type-generic unfused variant 3 of hemv/symv: processes the diagonal
 * block A11 first, then the panel *below* it (A21) with the fused
 * dotxaxpyf kernel — the mirror image of variant 1, which handles the
 * panel behind (A10).  conjh carries the conjugation component of the
 * Hermitian transpose (hemv conjugates, symv does not).  Written for the
 * lower-triangular case; the upper case swaps strides and toggles conj
 * parameters.  Instantiated for s, c and z at the bottom of this file;
 * 'd' is provided by the hand-written bli_dhemv_unf_var3 below.
 */
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conja, \
       conj_t  conjx, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    ctype*  one  = PASTEMAC(ch,1); \
    ctype*  zero = PASTEMAC(ch,0); \
    ctype*  A11; \
    ctype*  A21; \
    ctype*  a10t; \
    ctype*  alpha11; \
    ctype*  a21; \
    ctype*  x1; \
    ctype*  x2; \
    ctype*  chi11; \
    ctype*  y1; \
    ctype*  y2; \
    ctype*  y01; \
    ctype*  psi11; \
    ctype*  y21; \
    ctype   conjx_chi11; \
    ctype   alpha_chi11; \
    ctype   alpha11_temp; \
    dim_t   i, k, j; \
    dim_t   b_fuse, f; \
    dim_t   n_ahead; \
    dim_t   f_ahead, f_behind; \
    inc_t   rs_at, cs_at; \
    conj_t  conj0, conj1; \
\
    /* The algorithm will be expressed in terms of the lower triangular case;
       the upper triangular case is supported by swapping the row and column
       strides of A and toggling some conj parameters. */ \
    if ( bli_is_lower( uplo ) ) \
    { \
        rs_at = rs_a; \
        cs_at = cs_a; \
\
        conj0 = bli_apply_conj( conjh, conja ); \
        conj1 = conja; \
    } \
    else /* if ( bli_is_upper( uplo ) ) */ \
    { \
        rs_at = cs_a; \
        cs_at = rs_a; \
\
        conj0 = conja; \
        conj1 = bli_apply_conj( conjh, conja ); \
    } \
\
    /* If beta is zero, use setv. Otherwise, scale by beta. */ \
    if ( PASTEMAC(ch,eq0)( *beta ) ) \
    { \
        /* y = 0; */ \
        PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
        ( \
          BLIS_NO_CONJUGATE, \
          m, \
          zero, \
          y, incy, \
          cntx, \
          NULL \
        ); \
    } \
    else \
    { \
        /* y = beta * y; */ \
        PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
        ( \
          BLIS_NO_CONJUGATE, \
          m, \
          beta, \
          y, incy, \
          cntx, \
          NULL \
        ); \
    } \
\
    PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
\
    /* Query the context for the kernel function pointer and fusing factor. */ \
    kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
    b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
\
    for ( i = 0; i < m; i += f ) \
    { \
        f       = bli_determine_blocksize_dim_f( i, m, b_fuse ); \
        n_ahead = m - i - f; \
        A11     = a + (i  )*rs_at + (i  )*cs_at; \
        A21     = a + (i+f)*rs_at + (i  )*cs_at; \
        x1      = x + (i  )*incx; \
        x2      = x + (i+f)*incx; \
        y1      = y + (i  )*incy; \
        y2      = y + (i+f)*incy; \
\
        /* y1 = y1 + alpha * A11 * x1;  (variant 4) */ \
        for ( k = 0; k < f; ++k ) \
        { \
            f_behind = k; \
            f_ahead  = f - k - 1; \
            a10t     = A11 + (k  )*rs_at + (0  )*cs_at; \
            alpha11  = A11 + (k  )*rs_at + (k  )*cs_at; \
            a21      = A11 + (k+1)*rs_at + (k  )*cs_at; \
            chi11    = x1  + (k  )*incx; \
            y01      = y1  + (0  )*incy; \
            psi11    = y1  + (k  )*incy; \
            y21      = y1  + (k+1)*incy; \
\
            /* y01 = y01 + alpha * a10t' * chi11; */ \
            PASTEMAC(ch,copycjs)( conjx, *chi11, conjx_chi11 ); \
            PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \
            if ( bli_is_conj( conj0 ) ) \
            { \
                for ( j = 0; j < f_behind; ++j ) \
                    PASTEMAC(ch,axpyjs)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
            } \
            else \
            { \
                for ( j = 0; j < f_behind; ++j ) \
                    PASTEMAC(ch,axpys)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
            } \
\
            /* For hemv, explicitly set the imaginary component of alpha11 to
               zero. */ \
            PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
            if ( bli_is_conj( conjh ) ) \
                PASTEMAC(ch,seti0s)( alpha11_temp ); \
\
            /* psi11 = psi11 + alpha * alpha11 * chi11; */ \
            PASTEMAC(ch,axpys)( alpha_chi11, alpha11_temp, *psi11 ); \
\
            /* y21 = y21 + alpha * a21 * chi11; */ \
            if ( bli_is_conj( conj1 ) ) \
            { \
                for ( j = 0; j < f_ahead; ++j ) \
                    PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
            } \
            else \
            { \
                for ( j = 0; j < f_ahead; ++j ) \
                    PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
            } \
        } \
\
        /* y1 = y1 + alpha * A21' * x2;  (dotxf) */ \
        /* y2 = y2 + alpha * A21  * x1;  (axpyf) */ \
        kfp_xf \
        ( \
          conj0, \
          conj1, \
          conjx, \
          conjx, \
          n_ahead, \
          f, \
          alpha, \
          A21, rs_at, cs_at, \
          x2,  incx, \
          x1,  incx, \
          one, \
          y1,  incy, \
          y2,  incy, \
          cntx \
        ); \
    } \
}
/*
 * AMD (zen) helper: applies the f == 8 diagonal-block update of hemv
 * variant 3 for double-precision data.  The caller below guards the call
 * with (f == 8 && incx == 1 && incy == 1 && rs_at == 1) and passes cs_at
 * as this function's cs_a (the leading dimension) and the unit rs_at as
 * rs_a.  NOTE(review): defined in an AMD-specific kernel file not visible
 * here; confirm the stride convention against that definition.
 */
void bli_pre_hemv_8x8
     (
       double *a,
       double *x,
       double *y,
       double *alpha,
       dim_t   cs_a,
       dim_t   rs_a
     );
/*
 * Double-precision (real) specialization of unfused hemv/symv variant 3.
 *
 * Computes y := beta*y + alpha * A * x with A symmetric, processing the
 * diagonal block A11 first (via the AMD 8x8 helper when f == 8 with unit
 * strides, else a scalar loop) and then the panel below it (A21) with a
 * fused dotxf+axpyf (dotxaxpyf) kernel.  For real data the conjugation
 * parameters are no-ops, so conj0/conj1 are fixed to 0.  This function
 * overrides the type-generic template for 'd'; the s/c/z instantiations
 * are generated by GENTFUNC below.
 */
void bli_dhemv_unf_var3
     (
       uplo_t  uplo,
       conj_t  conja,
       conj_t  conjx,
       conj_t  conjh,
       dim_t   m,
       double* alpha,
       double* a, inc_t rs_a, inc_t cs_a,
       double* x, inc_t incx,
       double* beta,
       double* y, inc_t incy,
       cntx_t* cntx
     )
{
    const num_t dt = PASTEMAC(d,type);
    double*  one  = PASTEMAC(d,1);
    double*  zero = PASTEMAC(d,0);
    double*  A11;
    double*  A21;
    double*  a10t;
    double*  alpha11;
    double*  a21;
    double*  x1;
    double*  x2;
    double*  chi11;
    double*  y1;
    double*  y2;
    double*  y01;
    double*  psi11;
    double*  y21;
    double   conjx_chi11;
    double   alpha_chi11;
    double   alpha11_temp;
    dim_t    i, k, j;
    dim_t    b_fuse, f;
    dim_t    n_ahead;
    dim_t    f_ahead, f_behind;
    inc_t    rs_at, cs_at;
    /* Real data: conjugation is a no-op, so both kernel conj arguments
       stay at 0 (no conjugate) regardless of uplo. */
    conj_t   conj0 = 0, conj1 = 0;
    /* The algorithm will be expressed in terms of the lower triangular
     * case; the upper triangular case is supported by swapping the row
     * and column strides of A and toggling some conj parameters. */
    if ( bli_is_lower( uplo ) )
    {
        rs_at = rs_a;
        cs_at = cs_a;
    }
    else /* if ( bli_is_upper( uplo ) ) */
    {
        rs_at = cs_a;
        cs_at = rs_a;
    }
    /* If beta is zero, use setv. Otherwise, scale by beta. */
    if ( PASTEMAC(d,eq0)( *beta ) )
    {
        /* y = 0; */
        PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          m,
          zero,
          y, incy,
          cntx,
          NULL
        );
    }
    else
    {
        /* y = beta * y; */
        PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          m,
          beta,
          y, incy,
          cntx,
          NULL
        );
    }
    PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker;
    // This function is invoked on all architectures including generic.
    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == TRUE)
    {
        /* AVX path: hard-wire the zen fused kernel and its fusing
           factor of 8 (also the trigger for the 8x8 helper below). */
        kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8;
        b_fuse = 8;
    }
    else
    {
        /* cntx may legally arrive as NULL; fall back to the global
           kernel structure before querying. */
        if ( cntx == NULL ) cntx = bli_gks_query_cntx();
        kfp_dotxaxpyf_ker =
            bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx);
        b_fuse =
            bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx );
    }
    for ( i = 0; i < m; i += f )
    {
        f       = bli_determine_blocksize_dim_f( i, m, b_fuse );
        n_ahead = m - i - f;
        A11     = a + (i  )*rs_at + (i  )*cs_at;
        A21     = a + (i+f)*rs_at + (i  )*cs_at;
        x1      = x + (i  )*incx;
        x2      = x + (i+f)*incx;
        y1      = y + (i  )*incy;
        y2      = y + (i+f)*incy;
        /* y1 = y1 + alpha * A11 * x1;  (variant 4) */
        if((f == 8) && (incx == 1) && (incy == 1) && (rs_at == 1))
        {
            /*this helper function handles unit stride only*/
            /* cs_at is passed as the helper's leading dimension (cs_a)
               and the unit rs_at as its rs_a; see prototype above. */
            bli_pre_hemv_8x8(A11, x1, y1, alpha, cs_at, rs_at);
        }
        else
        {
            /* Scalar fallback: rank-1-style update of the diagonal
               block, one column k at a time. */
            for ( k = 0; k < f; ++k )
            {
                f_behind = k;
                f_ahead  = f - k - 1;
                a10t     = A11 + (k  )*rs_at + (0  )*cs_at;
                alpha11  = A11 + (k  )*rs_at + (k  )*cs_at;
                a21      = A11 + (k+1)*rs_at + (k  )*cs_at;
                chi11    = x1  + (k  )*incx;
                y01      = y1  + (0  )*incy;
                psi11    = y1  + (k  )*incy;
                y21      = y1  + (k+1)*incy;
                /* y01 = y01 + alpha * a10t' * chi11; */
                PASTEMAC(d,copycjs)( conjx,
                                     *chi11, conjx_chi11 );
                PASTEMAC(d,scal2s)( *alpha, conjx_chi11,
                                    alpha_chi11 );
                {
                    for ( j = 0; j < f_behind; ++j )
                    {
                        PASTEMAC(d,axpys)
                        ( alpha_chi11,
                          *(a10t + j*cs_at),
                          *(y01 + j*incy) );
                    }
                }
                PASTEMAC(d,copycjs)( conja, *alpha11,
                                     alpha11_temp );
                /* psi11 = psi11 + alpha * alpha11 * chi11; */
                PASTEMAC(d,axpys)( alpha_chi11, alpha11_temp,
                                   *psi11 );
                /* y21 = y21 + alpha * a21 * chi11; */
                for ( j = 0; j < f_ahead; ++j )
                {
                    PASTEMAC(d,axpys)( alpha_chi11,
                                       *(a21 + j*rs_at),
                                       *(y21 + j*incy) );
                }
            }
        }
        /* y1 = y1 + alpha * A21' * x2;  (dotxf) */
        /* y2 = y2 + alpha * A21  * x1;  (axpyf) */
        kfp_dotxaxpyf_ker
        (
          conj0,
          conj1,
          conjx,
          conjx,
          n_ahead,
          f,
          alpha,
          A21, rs_at, cs_at,
          x2,  incx,
          x1,  incx,
          one,
          y1,  incy,
          y2,  incy,
          cntx
        );
    }
}
/* Instantiate the reference template for the remaining datatypes; 'd' is
   provided by the hand-written bli_dhemv_unf_var3 above. */
GENTFUNC(float, s, hemv_unf_var3)
GENTFUNC(scomplex, c, hemv_unf_var3)
GENTFUNC(dcomplex, z, hemv_unf_var3)

View File

@@ -158,217 +158,5 @@ void PASTEMAC(ch,varname) \
} \
}
#ifdef BLIS_CONFIG_EPYC
/**
* Following is function declaration
* that computes her2 for transposed case.
* It handles triangular part of matrix and
* remaining computation in optimal way to
* gain performance improvement.
* a is triangular matrix, x and y are vectors
*/
void bli_dher2_trans_zen_int_4
(
double *a,
double *x,
double *y,
double *alpha,
dim_t m,
dim_t lda
);
void bli_dher2_unf_var1
(
uplo_t uplo,
conj_t conjx,
conj_t conjy,
conj_t conjh,
dim_t m,
double* alpha,
double* x, inc_t incx,
double* y, inc_t incy,
double* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx
)
{
const num_t dt = PASTEMAC(d,type);
double* x0;
double* chi1;
double* y0;
double* psi1;
double* c10t;
double* gamma11;
double alpha0;
double alpha1;
double alpha0_chi1;
double alpha1_psi1;
double alpha0_chi1_psi1;
double conjx0_chi1;
double conjy1_psi1;
double conjy0_psi1;
dim_t i;
dim_t n_behind;
inc_t rs_ct, cs_ct;
conj_t conj0, conj1;
/* The algorithm will be expressed in terms of the lower triangular
* case;the upper triangular case is supported by swapping the row
* and column strides of A and toggling some conj parameters.
*/
if ( bli_is_lower( uplo ) )
{
rs_ct = rs_c;
cs_ct = cs_c;
PASTEMAC(d,copys)( *alpha, alpha0 );
PASTEMAC(d,copycjs)( conjh, *alpha, alpha1 );
}
else /* if ( bli_is_upper( uplo ) ) */
{
rs_ct = cs_c;
cs_ct = rs_c;
/* Toggle conjugation of conjx/conjy, but only if we are being
* invoked as her2; for syr2, conjx/conjy are unchanged.
*/
conjx = bli_apply_conj( conjh, conjx );
conjy = bli_apply_conj( conjh, conjy );
PASTEMAC(d,copycjs)( conjh, *alpha, alpha0 );
PASTEMAC(d,copys)( *alpha, alpha1 );
}
/* Apply conjh (which carries the conjugation component of the
* Hermitian transpose, if applicable) to conjx and/or conjy as
* needed to arrive at the effective conjugation for the vector
* subproblems.
*/
conj0 = bli_apply_conj( conjh, conjy );
conj1 = bli_apply_conj( conjh, conjx );
PASTECH(d,axpy2v_ker_ft) kfp_2v;
/* Query the context for the kernel function pointer. */
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
if( (incx == 1) && (incy == 1) && (rs_ct == 1))
{
for ( i = 0; i < m; )
{
n_behind = i;
x0 = x + (0 )*incx;
chi1 = x + (i )*incx;
y0 = y + (0 )*incy;
psi1 = y + (i )*incy;
c10t = c + (i )*rs_ct + (0 )*cs_ct;
gamma11 = c + (i )*rs_ct + (i )*cs_ct;
if((n_behind >= 3))
{
bli_dher2_trans_zen_int_4(c10t, x0, y0, &alpha0, n_behind + 1, cs_ct);
i+=4;
}
else
{
/* Apply conjx and/or conjy to chi1 and/or psi1. */
PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have already been conjugated, if needed,
* by conjx and conjy.
*/
PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
alpha0_chi1_psi1 );
/* c10t = c10t + alpha * chi1 * y0'; */
/* c10t = c10t + conj(alpha) * psi1 * x0'; */
kfp_2v
(
conj0,
conj1,
n_behind,
&alpha0_chi1,
&alpha1_psi1,
y0, incy,
x0, incx,
c10t, cs_ct,
cntx
);
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
+ conj(alpha) * psi1 * conj(chi1); */
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
i+=1;
}
}
}
else
{
for ( i = 0; i < m; ++i )
{
n_behind = i;
x0 = x + (0 )*incx;
chi1 = x + (i )*incx;
y0 = y + (0 )*incy;
psi1 = y + (i )*incy;
c10t = c + (i )*rs_ct + (0 )*cs_ct;
gamma11 = c + (i )*rs_ct + (i )*cs_ct;
/* Apply conjx and/or conjy to chi1 and/or psi1. */
PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have already been conjugated, if needed,
* by conjx and conjy.
*/
PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
alpha0_chi1_psi1 );
/* c10t = c10t + alpha * chi1 * y0'; */
/* c10t = c10t + conj(alpha) * psi1 * x0'; */
kfp_2v
(
conj0,
conj1,
n_behind,
&alpha0_chi1,
&alpha1_psi1,
y0, incy,
x0, incx,
c10t, cs_ct,
cntx
);
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
+ conj(alpha) * psi1 * conj(chi1); */
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
}
}
}
GENTFUNC(float, s, her2_unf_var1)
GENTFUNC(scomplex, c, her2_unf_var1)
GENTFUNC(dcomplex, z,her2_unf_var1)
#else
INSERT_GENTFUNC_BASIC0( her2_unf_var1 )
#endif

View File

@@ -0,0 +1,369 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * Type-generic unfused variant 1 of her2/syr2: the rank-2 update
 * C := C + alpha * x * y' + conj(alpha) * y * x' on the stored triangle.
 * conjh carries the conjugation component of the Hermitian transpose
 * (her2 conjugates and zeroes the imaginary part of the diagonal; syr2
 * does not).  Written for the lower-triangular case; the upper case swaps
 * strides and toggles conj parameters.  Each row i updates c10t (the row
 * behind the diagonal) via a fused axpy2v kernel plus a scalar diagonal
 * update.
 */
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conjx, \
       conj_t  conjy, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    ctype*  x0; \
    ctype*  chi1; \
    ctype*  y0; \
    ctype*  psi1; \
    ctype*  c10t; \
    ctype*  gamma11; \
    ctype   alpha0; \
    ctype   alpha1; \
    ctype   alpha0_chi1; \
    ctype   alpha1_psi1; \
    ctype   alpha0_chi1_psi1; \
    ctype   conjx0_chi1; \
    ctype   conjy1_psi1; \
    ctype   conjy0_psi1; \
    dim_t   i; \
    dim_t   n_behind; \
    inc_t   rs_ct, cs_ct; \
    conj_t  conj0, conj1; \
\
    /* The algorithm will be expressed in terms of the lower triangular case;
       the upper triangular case is supported by swapping the row and column
       strides of A and toggling some conj parameters. */ \
    if ( bli_is_lower( uplo ) ) \
    { \
        rs_ct = rs_c; \
        cs_ct = cs_c; \
\
        PASTEMAC(ch,copys)( *alpha, alpha0 ); \
        PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
    } \
    else /* if ( bli_is_upper( uplo ) ) */ \
    { \
        rs_ct = cs_c; \
        cs_ct = rs_c; \
\
        /* Toggle conjugation of conjx/conjy, but only if we are being invoked
           as her2; for syr2, conjx/conjy are unchanged. */ \
        conjx = bli_apply_conj( conjh, conjx ); \
        conjy = bli_apply_conj( conjh, conjy ); \
\
        PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
        PASTEMAC(ch,copys)( *alpha, alpha1 ); \
    } \
\
    /* Apply conjh (which carries the conjugation component of the Hermitian
       transpose, if applicable) to conjx and/or conjy as needed to arrive at
       the effective conjugation for the vector subproblems. */ \
    conj0 = bli_apply_conj( conjh, conjy ); \
    conj1 = bli_apply_conj( conjh, conjx ); \
\
    PASTECH(ch,axpy2v_ker_ft) kfp_2v; \
\
    /* Query the context for the kernel function pointer. */ \
    kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \
\
    for ( i = 0; i < m; ++i ) \
    { \
        n_behind = i; \
        x0       = x + (0  )*incx; \
        chi1     = x + (i  )*incx; \
        y0       = y + (0  )*incy; \
        psi1     = y + (i  )*incy; \
        c10t     = c + (i  )*rs_ct + (0  )*cs_ct; \
        gamma11  = c + (i  )*rs_ct + (i  )*cs_ct; \
\
        /* Apply conjx and/or conjy to chi1 and/or psi1. */ \
        PASTEMAC(ch,copycjs)( conjx, *chi1, conjx0_chi1 ); \
        PASTEMAC(ch,copycjs)( conjy, *psi1, conjy1_psi1 ); \
        PASTEMAC(ch,copycjs)( conj0, *psi1, conjy0_psi1 ); \
\
        /* Compute scalars for vector subproblems. */ \
        PASTEMAC(ch,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); \
        PASTEMAC(ch,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); \
\
        /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have
           already been conjugated, if needed, by conjx and conjy. */ \
        PASTEMAC(ch,scal2s)( alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \
\
        /* c10t = c10t + alpha * chi1 * y0'; */ \
        /* c10t = c10t + conj(alpha) * psi1 * x0'; */ \
        kfp_2v \
        ( \
          conj0, \
          conj1, \
          n_behind, \
          &alpha0_chi1, \
          &alpha1_psi1, \
          y0, incy, \
          x0, incx, \
          c10t, cs_ct, \
          cntx \
        ); \
\
        /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \
                             + conj(alpha) * psi1 * conj(chi1); */ \
        PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
        PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
\
        /* For her2, explicitly set the imaginary component of gamma11 to
           zero. */ \
        if ( bli_is_conj( conjh ) ) \
            PASTEMAC(ch,seti0s)( *gamma11 ); \
    } \
}
/**
 * Declaration of the AMD (zen) helper that computes double-precision her2
 * for the transposed case.  It handles the triangular part of the matrix
 * together with the remaining computation in an optimized way to gain
 * performance.  a points into the triangular matrix, x and y are the
 * input vectors.  At the call site below, m is passed as n_behind + 1
 * (the current row length) and lda as the column stride; the caller
 * guards on unit incx/incy/rs_ct and advances by 4 rows per call —
 * NOTE(review): confirm the 4-row batch behavior against the kernel
 * definition, which is not visible in this file.
 */
void bli_dher2_trans_zen_int_4
     (
       double *a,
       double *x,
       double *y,
       double *alpha,
       dim_t   m,
       dim_t   lda
     );
void bli_dher2_unf_var1
(
uplo_t uplo,
conj_t conjx,
conj_t conjy,
conj_t conjh,
dim_t m,
double* alpha,
double* x, inc_t incx,
double* y, inc_t incy,
double* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx
)
{
const num_t dt = PASTEMAC(d,type);
double* x0;
double* chi1;
double* y0;
double* psi1;
double* c10t;
double* gamma11;
double alpha0;
double alpha1;
double alpha0_chi1;
double alpha1_psi1;
double alpha0_chi1_psi1;
double conjx0_chi1;
double conjy1_psi1;
double conjy0_psi1;
dim_t i;
dim_t n_behind;
inc_t rs_ct, cs_ct;
conj_t conj0, conj1;
/* The algorithm will be expressed in terms of the lower triangular
* case;the upper triangular case is supported by swapping the row
* and column strides of A and toggling some conj parameters.
*/
if ( bli_is_lower( uplo ) )
{
rs_ct = rs_c;
cs_ct = cs_c;
PASTEMAC(d,copys)( *alpha, alpha0 );
PASTEMAC(d,copycjs)( conjh, *alpha, alpha1 );
}
else /* if ( bli_is_upper( uplo ) ) */
{
rs_ct = cs_c;
cs_ct = rs_c;
/* Toggle conjugation of conjx/conjy, but only if we are being
* invoked as her2; for syr2, conjx/conjy are unchanged.
*/
conjx = bli_apply_conj( conjh, conjx );
conjy = bli_apply_conj( conjh, conjy );
PASTEMAC(d,copycjs)( conjh, *alpha, alpha0 );
PASTEMAC(d,copys)( *alpha, alpha1 );
}
/* Apply conjh (which carries the conjugation component of the
* Hermitian transpose, if applicable) to conjx and/or conjy as
* needed to arrive at the effective conjugation for the vector
* subproblems.
*/
conj0 = bli_apply_conj( conjh, conjy );
conj1 = bli_apply_conj( conjh, conjx );
PASTECH(d,axpy2v_ker_ft) kfp_2v;
/* Query the context for the kernel function pointer. */
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
if( (incx == 1) && (incy == 1) && (rs_ct == 1))
{
for ( i = 0; i < m; )
{
n_behind = i;
x0 = x + (0 )*incx;
chi1 = x + (i )*incx;
y0 = y + (0 )*incy;
psi1 = y + (i )*incy;
c10t = c + (i )*rs_ct + (0 )*cs_ct;
gamma11 = c + (i )*rs_ct + (i )*cs_ct;
if((n_behind >= 3))
{
bli_dher2_trans_zen_int_4(c10t, x0, y0, &alpha0, n_behind + 1, cs_ct);
i+=4;
}
else
{
/* Apply conjx and/or conjy to chi1 and/or psi1. */
PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have already been conjugated, if needed,
* by conjx and conjy.
*/
PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
alpha0_chi1_psi1 );
/* c10t = c10t + alpha * chi1 * y0'; */
/* c10t = c10t + conj(alpha) * psi1 * x0'; */
kfp_2v
(
conj0,
conj1,
n_behind,
&alpha0_chi1,
&alpha1_psi1,
y0, incy,
x0, incx,
c10t, cs_ct,
cntx
);
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
+ conj(alpha) * psi1 * conj(chi1); */
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
i+=1;
}
}
}
else
{
for ( i = 0; i < m; ++i )
{
n_behind = i;
x0 = x + (0 )*incx;
chi1 = x + (i )*incx;
y0 = y + (0 )*incy;
psi1 = y + (i )*incy;
c10t = c + (i )*rs_ct + (0 )*cs_ct;
gamma11 = c + (i )*rs_ct + (i )*cs_ct;
/* Apply conjx and/or conjy to chi1 and/or psi1. */
PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have already been conjugated, if needed,
* by conjx and conjy.
*/
PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
alpha0_chi1_psi1 );
/* c10t = c10t + alpha * chi1 * y0'; */
/* c10t = c10t + conj(alpha) * psi1 * x0'; */
kfp_2v
(
conj0,
conj1,
n_behind,
&alpha0_chi1,
&alpha1_psi1,
y0, incy,
x0, incx,
c10t, cs_ct,
cntx
);
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
+ conj(alpha) * psi1 * conj(chi1); */
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
}
}
}
GENTFUNC(float, s, her2_unf_var1)
GENTFUNC(scomplex, c, her2_unf_var1)
GENTFUNC(dcomplex, z,her2_unf_var1)

View File

@@ -166,192 +166,5 @@ void PASTEMAC(ch,varname) \
} \
}
#ifdef BLIS_CONFIG_EPYC
/**
* Following is function declaration
* that computes her2 for transposed case.
* It handles triangular part of matrix and
* remaining computation in optimal way to
* gain performance improvement.
* a is triangular matrix, x and y are vectors
*/
void bli_dher2_zen_int_4
(
double *a,
double *x,
double *y,
double *alpha,
dim_t m,
dim_t lda
);
void bli_dher2_unf_var4
(
uplo_t uplo,
conj_t conjx,
conj_t conjy,
conj_t conjh,
dim_t m,
double* alpha,
double* x, inc_t incx,
double* y, inc_t incy,
double* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx
)
{
double* chi1;
double* x2;
double* psi1;
double* y2;
double* gamma11;
double* c21;
double alpha0;
double alpha0_psi1;
double alpha1_chi1;
double alpha0_chi1_psi1;
dim_t i;
dim_t n_ahead;
inc_t rs_ct, cs_ct;
const num_t dt = PASTEMAC(d,type);
/* The algorithm will be expressed in terms of the lower triangular
* case; the upper triangular case is supported by swapping the row
* and column strides of A and toggling some conj parameters.
*/
if ( bli_is_lower( uplo ) )
{
rs_ct = rs_c;
cs_ct = cs_c;
PASTEMAC(d,copys)( *alpha, alpha0 );
}
else /* if ( bli_is_upper( uplo ) ) */
{
rs_ct = cs_c;
cs_ct = rs_c;
/* Toggle conjugation of conjx/conjy, but only if we are being
* invoked as her2; for syr2, conjx/conjy are unchanged.
*/
PASTEMAC(d,copys)( *alpha, alpha0 );
}
/* Apply conjh (which carries the conjugation component of the
* Hermitian transpose, if applicable) to conjx and/or conjy as
* needed to arrive at the effective conjugation for the vector
* subproblems.
*/
PASTECH(d,axpy2v_ker_ft) kfp_2v;
/* Query the context for the kernel function pointer. */
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
if((incx == 1) && (incy == 1) && (rs_ct == 1))
{
for ( i = 0; i < m; )
{
n_ahead = m - i - 1;
chi1 = x + (i ) * incx;
x2 = x + (i+1) * incx;
psi1 = y + (i ) * incy;
y2 = y + (i+1) * incy;
gamma11 = c + (i ) + (i )*cs_ct;
c21 = c + (i+1) + (i )*cs_ct;
if((n_ahead >= 3))
{
bli_dher2_zen_int_4(gamma11, chi1, psi1, &alpha0, n_ahead + 1, cs_ct);
i+= 4;
}
else
{
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have
already been conjugated, if needed, by conjx and
conjy. */
PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
alpha0_chi1_psi1 );
/* c21 = c21 + alpha * x2 * conj(psi1); */
/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */
kfp_2v
(
conjx,
conjy,
n_ahead,
&alpha0_psi1,
&alpha1_chi1,
x2, incx,
y2, incy,
c21, rs_ct,
cntx
);
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
i+=1;
}
}
}
else
{
for ( i = 0; i < m; ++i)
{
n_ahead = m - i - 1;
chi1 = x + (i ) * incx;
x2 = x + (i+1) * incx;
psi1 = y + (i ) * incy;
y2 = y + (i+1) * incy;
gamma11 = c + (i ) + (i )*cs_ct;
c21 = c + (i+1) + (i )*cs_ct;
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have
already been conjugated, if needed, by conjx and
conjy. */
PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
alpha0_chi1_psi1 );
/* c21 = c21 + alpha * x2 * conj(psi1); */
/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */
kfp_2v
(
conjx,
conjy,
n_ahead,
&alpha0_psi1,
&alpha1_chi1,
x2, incx,
y2, incy,
c21, rs_ct,
cntx
);
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
}
}
}
GENTFUNC(float, s, her2_unf_var4)
GENTFUNC(scomplex, c, her2_unf_var4)
GENTFUNC(dcomplex, z,her2_unf_var4)
#else
INSERT_GENTFUNC_BASIC0( her2_unf_var4 )
#endif

View File

@@ -0,0 +1,354 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uplo, \
conj_t conjx, \
conj_t conjy, \
conj_t conjh, \
dim_t m, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* chi1; \
ctype* x2; \
ctype* psi1; \
ctype* y2; \
ctype* gamma11; \
ctype* c21; \
ctype alpha0; \
ctype alpha1; \
ctype alpha0_psi1; \
ctype alpha1_chi1; \
ctype alpha0_chi1_psi1; \
ctype conjy0_psi1; \
ctype conjx1_chi1; \
ctype conjx0_chi1; \
dim_t i; \
dim_t n_ahead; \
inc_t rs_ct, cs_ct; \
conj_t conj0, conj1; \
conj_t conjh_conjx; \
conj_t conjh_conjy; \
\
/* Eliminate unused variable warnings. */ \
( void )conjh_conjx; \
( void )conjh_conjy; \
\
/* The algorithm will be expressed in terms of the lower triangular case;
the upper triangular case is supported by swapping the row and column
strides of A and toggling some conj parameters. */ \
if ( bli_is_lower( uplo ) ) \
{ \
rs_ct = rs_c; \
cs_ct = cs_c; \
\
PASTEMAC(ch,copys)( *alpha, alpha0 ); \
PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
} \
else /* if ( bli_is_upper( uplo ) ) */ \
{ \
rs_ct = cs_c; \
cs_ct = rs_c; \
\
/* Toggle conjugation of conjx/conjy, but only if we are being invoked
as her2; for syr2, conjx/conjy are unchanged. */ \
conjx = bli_apply_conj( conjh, conjx ); \
conjy = bli_apply_conj( conjh, conjy ); \
\
PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
PASTEMAC(ch,copys)( *alpha, alpha1 ); \
} \
\
/* Apply conjh (which carries the conjugation component of the Hermitian
transpose, if applicable) to conjx and/or conjy as needed to arrive at
the effective conjugation for the vector subproblems. */ \
conj0 = conjx; \
conj1 = conjy; \
conjh_conjx = bli_apply_conj( conjh, conjx ); \
conjh_conjy = bli_apply_conj( conjh, conjy ); \
\
PASTECH(ch,axpy2v_ker_ft) kfp_2v; \
\
/* Query the context for the kernel function pointer. */ \
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \
n_ahead = m - i - 1; \
chi1 = x + (i )*incx; \
x2 = x + (i+1)*incx; \
psi1 = y + (i )*incy; \
y2 = y + (i+1)*incy; \
gamma11 = c + (i )*rs_ct + (i )*cs_ct; \
c21 = c + (i+1)*rs_ct + (i )*cs_ct; \
\
/* Apply conjx and/or conjy to chi1 and/or psi1. */ \
PASTEMAC(ch,copycjs)( conjh_conjy, *psi1, conjy0_psi1 ); \
PASTEMAC(ch,copycjs)( conjh_conjx, *chi1, conjx1_chi1 ); \
PASTEMAC(ch,copycjs)( conj0, *chi1, conjx0_chi1 ); \
\
/* Compute scalars for vector subproblems. */ \
PASTEMAC(ch,scal2s)( alpha0, conjy0_psi1, alpha0_psi1 ); \
PASTEMAC(ch,scal2s)( alpha1, conjx1_chi1, alpha1_chi1 ); \
\
/* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have
already been conjugated, if needed, by conjx and conjy. */ \
PASTEMAC(ch,scal2s)( alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \
\
/* c21 = c21 + alpha * x2 * conj(psi1); */ \
/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ \
kfp_2v \
( \
conj0, \
conj1, \
n_ahead, \
&alpha0_psi1, \
&alpha1_chi1, \
x2, incx, \
y2, incy, \
c21, rs_ct, \
cntx \
); \
\
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \
+ conj(alpha) * psi1 * conj(chi1); */ \
PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
\
/* For her2, explicitly set the imaginary component of gamma11 to
zero. */ \
if ( bli_is_conj( conjh ) ) \
PASTEMAC(ch,seti0s)( *gamma11 ); \
} \
}
/**
 * Prototype of the AMD zen fused her2 kernel used by the unit-stride
 * fast path of bli_dher2_unf_var4 below.  The call site advances four
 * columns per invocation, so the kernel is expected to update four
 * consecutive columns of the stored triangle per call -- NOTE(review):
 * confirm against the kernel definition in the zen kernel set.
 *
 *   a     - address of the diagonal element starting the block
 *   x, y  - the input vectors
 *   alpha - scaling factor
 *   m     - length of the first updated column (including the diagonal)
 *   lda   - column stride of a
 */
void bli_dher2_zen_int_4
     (
       double *a,
       double *x,
       double *y,
       double *alpha,
       dim_t m,
       dim_t lda
     );

/**
 * Double-precision her2/syr2, unfused variant 4 (column-wise traversal):
 *
 *   C := C + alpha * x * y' + conj(alpha) * y * x'
 *
 * referencing only the triangle of C selected by uplo.  For real data
 * conjugation is a no-op, so a single scalar alpha0 serves both rank-1
 * contributions and no conjx/conjy toggling is needed.
 *
 * When x, y and the columns of C are all unit stride, groups of four
 * columns are handed to the fused kernel bli_dher2_zen_int_4; the
 * remaining columns, and the general-stride case, use the axpy2v
 * kernel queried from the context.
 */
void bli_dher2_unf_var4
     (
       uplo_t uplo,
       conj_t conjx,
       conj_t conjy,
       conj_t conjh,
       dim_t m,
       double* alpha,
       double* x, inc_t incx,
       double* y, inc_t incy,
       double* c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx
     )
{
	double* chi1;
	double* x2;
	double* psi1;
	double* y2;
	double* gamma11;
	double* c21;
	double alpha0;
	double alpha0_psi1;
	double alpha1_chi1;
	double alpha0_chi1_psi1;
	dim_t i;
	dim_t n_ahead;
	inc_t rs_ct, cs_ct;
	const num_t dt = PASTEMAC(d,type);

	/* The algorithm will be expressed in terms of the lower triangular
	 * case; the upper triangular case is supported by swapping the row
	 * and column strides of A and toggling some conj parameters.
	 */
	if ( bli_is_lower( uplo ) )
	{
		rs_ct = rs_c;
		cs_ct = cs_c;
		PASTEMAC(d,copys)( *alpha, alpha0 );
	}
	else /* if ( bli_is_upper( uplo ) ) */
	{
		rs_ct = cs_c;
		cs_ct = rs_c;
		/* For real data conj() is the identity, so no conjx/conjy
		 * toggling is required here; alpha is copied unchanged.
		 */
		PASTEMAC(d,copys)( *alpha, alpha0 );
	}
	/* Apply conjh (which carries the conjugation component of the
	 * Hermitian transpose, if applicable) to conjx and/or conjy as
	 * needed to arrive at the effective conjugation for the vector
	 * subproblems.
	 */
	PASTECH(d,axpy2v_ker_ft) kfp_2v;
	/* Query the context for the axpy2v kernel function pointer. */
	kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
	if((incx == 1) && (incy == 1) && (rs_ct == 1))
	{
		/* Unit-stride fast path: process four columns per iteration
		 * with the fused zen kernel where possible.
		 */
		for ( i = 0; i < m; )
		{
			n_ahead = m - i - 1;
			chi1 = x + (i ) * incx;
			x2 = x + (i+1) * incx;
			psi1 = y + (i ) * incy;
			y2 = y + (i+1) * incy;
			gamma11 = c + (i ) + (i )*cs_ct;
			c21 = c + (i+1) + (i )*cs_ct;
			/* n_ahead >= 3 guarantees i+3 <= m-1, i.e. at least four
			 * columns remain, so the fused kernel (which advances i by
			 * four) stays within bounds.
			 */
			if((n_ahead >= 3))
			{
				bli_dher2_zen_int_4(gamma11, chi1, psi1, &alpha0, n_ahead + 1, cs_ct);
				i+= 4;
			}
			else
			{
				/* Compute scalars for vector subproblems. */
				PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
				PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );
				/* Compute alpha * chi1 * conj(psi1); conjugation is a
				 * no-op for real data.
				 */
				PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
				                    alpha0_chi1_psi1 );
				/* c21 = c21 + alpha * x2 * conj(psi1); */
				/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */
				kfp_2v
				(
				  conjx,
				  conjy,
				  n_ahead,
				  &alpha0_psi1,
				  &alpha1_chi1,
				  x2, incx,
				  y2, incy,
				  c21, rs_ct,
				  cntx
				);
				/* gamma11 receives both rank-1 contributions, which
				 * coincide for real data.
				 */
				PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
				PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
				i+=1;
			}
		}
	}
	else
	{
		/* General-stride path: one column of the triangle per
		 * iteration, using the context's axpy2v kernel.
		 */
		for ( i = 0; i < m; ++i)
		{
			n_ahead = m - i - 1;
			chi1 = x + (i ) * incx;
			x2 = x + (i+1) * incx;
			psi1 = y + (i ) * incy;
			y2 = y + (i+1) * incy;
			gamma11 = c + (i ) + (i )*cs_ct;
			c21 = c + (i+1) + (i )*cs_ct;
			/* Compute scalars for vector subproblems. */
			PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
			PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );
			/* Compute alpha * chi1 * conj(psi1); conjugation is a
			 * no-op for real data.
			 */
			PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
			                    alpha0_chi1_psi1 );
			/* c21 = c21 + alpha * x2 * conj(psi1); */
			/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */
			kfp_2v
			(
			  conjx,
			  conjy,
			  n_ahead,
			  &alpha0_psi1,
			  &alpha1_chi1,
			  x2, incx,
			  y2, incy,
			  c21, rs_ct,
			  cntx
			);
			PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
			PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
		}
	}
}
/* Instantiate the generic template (defined above) for the remaining
   datatypes; the double instance is the hand-written function above. */
GENTFUNC(float, s, her2_unf_var4)
GENTFUNC(scomplex, c, her2_unf_var4)
GENTFUNC(dcomplex, z,her2_unf_var4)

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -231,421 +231,4 @@ void PASTEMAC(ch,varname) \
} \
}
#ifdef BLIS_CONFIG_EPYC
void bli_dtrsv_unf_var1
(
uplo_t uploa,
trans_t transa,
diag_t diaga,
dim_t m,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
cntx_t* cntx
)
{
double* one = PASTEMAC(d,1);
double* minus_one = PASTEMAC(d,m1);
double* A10;
double* A11;
double* A12;
double* a10t;
double* alpha11;
double* a12t;
double* x0;
double* x1;
double* x2;
double* x01;
double* chi11;
double* x21;
double alpha11_conj;
double rho1;
dim_t iter, i, k, j, l;
dim_t b_fuse, f;
dim_t n_behind, f_behind;
inc_t rs_at, cs_at;
uplo_t uploa_trans;
conj_t conja;
/* x = alpha * x; */
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
alpha,
x, incx,
cntx,
NULL
);
if( bli_does_notrans( transa ) )
{
rs_at = rs_a;
cs_at = cs_a;
uploa_trans = uploa;
}
else /* if ( bli_does_trans( transa ) ) */
{
rs_at = cs_a;
cs_at = rs_a;
uploa_trans = bli_uplo_toggled( uploa );
}
conja = bli_extract_conj( transa );
PASTECH(d,dotxf_ker_ft) kfp_df;
/* Assign kernel function pointer and fusing factor. */
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
kfp_df = bli_ddotxf_zen_int_8;
b_fuse = 8;
}
else
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
num_t dt = PASTEMAC(d,type);
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
}
/* We reduce all of the possible cases down to just lower/upper. */
if ( bli_is_upper( uploa_trans ) )
{
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
i = m - iter - f;
n_behind = iter;
A11 = a + (i )*rs_at + (i )*cs_at;
A12 = a + (i )*rs_at + (i+f)*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
/* x1 = x1 - A12 * x2; */
kfp_df
(
conja,
BLIS_NO_CONJUGATE,
n_behind,
f,
minus_one,
A12, cs_at, rs_at,
x2, incx,
one,
x1, incx,
cntx
);
/* x1 = x1 / triu( A11 ); */
for ( k = 0; k < f; ++k )
{
l = f - k - 1;
f_behind = k;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a12t = A11 + (l )*rs_at + (l+1)*cs_at;
chi11 = x1 + (l )*incx;
x21 = x1 + (l+1)*incx;
/* chi11 = chi11 - a12t * x21; */
PASTEMAC(d,set0s)( rho1 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
}
else
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
}
PASTEMAC(d,subs)( rho1, *chi11 );
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
}
}
}
}
else /* if ( bli_is_lower( uploa_trans ) ) */
{
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
i = iter;
n_behind = i;
A11 = a + (i )*rs_at + (i )*cs_at;
A10 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (i )*incx;
x0 = x + (0 )*incx;
/* x1 = x1 - A10 * x0; */
kfp_df
(
conja,
BLIS_NO_CONJUGATE,
n_behind,
f,
minus_one,
A10, cs_at, rs_at,
x0, incx,
one,
x1, incx,
cntx
);
/* x1 = x1 / tril( A11 ); */
for ( k = 0; k < f; ++k )
{
l = k;
f_behind = l;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a10t = A11 + (l )*rs_at + (0 )*cs_at;
chi11 = x1 + (l )*incx;
x01 = x1 + (0 )*incx;
/* chi11 = chi11 - a10t * x01; */
PASTEMAC(d,set0s)( rho1 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
}
else
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
}
PASTEMAC(d,subs)( rho1, *chi11 );
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
}
}
}
}
}
void bli_strsv_unf_var1
(
uplo_t uploa,
trans_t transa,
diag_t diaga,
dim_t m,
float* alpha,
float* a, inc_t rs_a, inc_t cs_a,
float* x, inc_t incx,
cntx_t* cntx
)
{
float* one = PASTEMAC(s,1);
float* minus_one = PASTEMAC(s,m1);
float* A10;
float* A11;
float* A12;
float* a10t;
float* alpha11;
float* a12t;
float* x0;
float* x1;
float* x2;
float* x01;
float* chi11;
float* x21;
float alpha11_conj;
float rho1;
dim_t iter, i, k, j, l;
dim_t b_fuse, f;
dim_t n_behind, f_behind;
inc_t rs_at, cs_at;
uplo_t uploa_trans;
conj_t conja;
/* x = alpha * x; */
PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
alpha,
x, incx,
cntx,
NULL
);
if( bli_does_notrans( transa ) )
{
rs_at = rs_a;
cs_at = cs_a;
uploa_trans = uploa;
}
else /* if ( bli_does_trans( transa ) ) */
{
rs_at = cs_a;
cs_at = rs_a;
uploa_trans = bli_uplo_toggled( uploa );
}
conja = bli_extract_conj( transa );
PASTECH(s,dotxf_ker_ft) kfp_df;
/* Assign kernel function pointer and fusing factor. */
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
kfp_df = bli_sdotxf_zen_int_8;
b_fuse = 8;
}
else
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
num_t dt = PASTEMAC(s,type);
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
}
/* We reduce all of the possible cases down to just lower/upper. */
if ( bli_is_upper( uploa_trans ) )
{
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
i = m - iter - f;
n_behind = iter;
A11 = a + (i )*rs_at + (i )*cs_at;
A12 = a + (i )*rs_at + (i+f)*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
/* x1 = x1 - A12 * x2; */
kfp_df
(
conja,
BLIS_NO_CONJUGATE,
n_behind,
f,
minus_one,
A12, cs_at, rs_at,
x2, incx,
one,
x1, incx,
cntx
);
/* x1 = x1 / triu( A11 ); */
for ( k = 0; k < f; ++k )
{
l = f - k - 1;
f_behind = k;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a12t = A11 + (l )*rs_at + (l+1)*cs_at;
chi11 = x1 + (l )*incx;
x21 = x1 + (l+1)*incx;
/* chi11 = chi11 - a12t * x21; */
PASTEMAC(s,set0s)( rho1 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(s,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
}
else
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(s,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
}
PASTEMAC(s,subs)( rho1, *chi11 );
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(s,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(s,invscals)( alpha11_conj, *chi11 );
}
}
}
}
else /* if ( bli_is_lower( uploa_trans ) ) */
{
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
i = iter;
n_behind = i;
A11 = a + (i )*rs_at + (i )*cs_at;
A10 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (i )*incx;
x0 = x + (0 )*incx;
/* x1 = x1 - A10 * x0; */
kfp_df
(
conja,
BLIS_NO_CONJUGATE,
n_behind,
f,
minus_one,
A10, cs_at, rs_at,
x0, incx,
one,
x1, incx,
cntx
);
/* x1 = x1 / tril( A11 ); */
for ( k = 0; k < f; ++k )
{
l = k;
f_behind = l;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a10t = A11 + (l )*rs_at + (0 )*cs_at;
chi11 = x1 + (l )*incx;
x01 = x1 + (0 )*incx;
/* chi11 = chi11 - a10t * x01; */
PASTEMAC(s,set0s)( rho1 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(s,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
}
else
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(s,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
}
PASTEMAC(s,subs)( rho1, *chi11 );
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(s,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(s,invscals)( alpha11_conj, *chi11 );
}
}
}
}
}
INSERT_GENTFUNC_BASIC0_CZ( trsv_unf_var1 )
#else
INSERT_GENTFUNC_BASIC0( trsv_unf_var1 )
#endif

View File

@@ -0,0 +1,638 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
cntx_t* cntx \
) \
{ \
if(cntx == NULL) cntx = bli_gks_query_cntx(); \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* one = PASTEMAC(ch,1); \
ctype* minus_one = PASTEMAC(ch,m1); \
ctype* A10; \
ctype* A11; \
ctype* A12; \
ctype* a10t; \
ctype* alpha11; \
ctype* a12t; \
ctype* x0; \
ctype* x1; \
ctype* x2; \
ctype* x01; \
ctype* chi11; \
ctype* x21; \
ctype alpha11_conj; \
ctype rho1; \
dim_t iter, i, k, j, l; \
dim_t b_fuse, f; \
dim_t n_behind, f_behind; \
inc_t rs_at, cs_at; \
uplo_t uploa_trans; \
conj_t conja; \
\
/* x = alpha * x; */ \
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
alpha, \
x, incx, \
cntx, \
NULL \
); \
\
if ( bli_does_notrans( transa ) ) \
{ \
rs_at = rs_a; \
cs_at = cs_a; \
uploa_trans = uploa; \
} \
else /* if ( bli_does_trans( transa ) ) */ \
{ \
rs_at = cs_a; \
cs_at = rs_a; \
uploa_trans = bli_uplo_toggled( uploa ); \
} \
\
conja = bli_extract_conj( transa ); \
\
PASTECH(ch,dotxf_ker_ft) kfp_df; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
\
/* We reduce all of the possible cases down to just lower/upper. */ \
if ( bli_is_upper( uploa_trans ) ) \
{ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
i = m - iter - f; \
n_behind = iter; \
A11 = a + (i )*rs_at + (i )*cs_at; \
A12 = a + (i )*rs_at + (i+f)*cs_at; \
x1 = x + (i )*incx; \
x2 = x + (i+f)*incx; \
\
/* x1 = x1 - A12 * x2; */ \
kfp_df \
( \
conja, \
BLIS_NO_CONJUGATE, \
n_behind, \
f, \
minus_one, \
A12, cs_at, rs_at, \
x2, incx, \
one, \
x1, incx, \
cntx \
); \
\
/* x1 = x1 / triu( A11 ); */ \
for ( k = 0; k < f; ++k ) \
{ \
l = f - k - 1; \
f_behind = k; \
alpha11 = A11 + (l )*rs_at + (l )*cs_at; \
a12t = A11 + (l )*rs_at + (l+1)*cs_at; \
chi11 = x1 + (l )*incx; \
x21 = x1 + (l+1)*incx; \
\
/* chi11 = chi11 - a12t * x21; */ \
PASTEMAC(ch,set0s)( rho1 ); \
if ( bli_is_conj( conja ) ) \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
} \
else \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
} \
PASTEMAC(ch,subs)( rho1, *chi11 ); \
\
/* chi11 = chi11 / alpha11; */ \
if ( bli_is_nonunit_diag( diaga ) ) \
{ \
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \
} \
} \
} \
} \
else /* if ( bli_is_lower( uploa_trans ) ) */ \
{ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
i = iter; \
n_behind = i; \
A11 = a + (i )*rs_at + (i )*cs_at; \
A10 = a + (i )*rs_at + (0 )*cs_at; \
x1 = x + (i )*incx; \
x0 = x + (0 )*incx; \
\
/* x1 = x1 - A10 * x0; */ \
kfp_df \
( \
conja, \
BLIS_NO_CONJUGATE, \
n_behind, \
f, \
minus_one, \
A10, cs_at, rs_at, \
x0, incx, \
one, \
x1, incx, \
cntx \
); \
\
/* x1 = x1 / tril( A11 ); */ \
for ( k = 0; k < f; ++k ) \
{ \
l = k; \
f_behind = l; \
alpha11 = A11 + (l )*rs_at + (l )*cs_at; \
a10t = A11 + (l )*rs_at + (0 )*cs_at; \
chi11 = x1 + (l )*incx; \
x01 = x1 + (0 )*incx; \
\
/* chi11 = chi11 - a10t * x01; */ \
PASTEMAC(ch,set0s)( rho1 ); \
if ( bli_is_conj( conja ) ) \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
} \
else \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
} \
PASTEMAC(ch,subs)( rho1, *chi11 ); \
\
/* chi11 = chi11 / alpha11; */ \
if ( bli_is_nonunit_diag( diaga ) ) \
{ \
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \
} \
} \
} \
} \
}
// bli_dtrsv_unf_var1: double-precision triangular solve (TRSV), unblocked
// fused variant 1 (dotxf-based). Solves op(A) * x = alpha * x in place,
// where A is an m x m triangular matrix (strides rs_a/cs_a), op() is
// selected by transa, and diaga says whether the diagonal is unit.
// The solve proceeds in panels of up to b_fuse rows; the bulk of the
// update is done by a fused dotxf kernel and the small triangular tip is
// solved with scalar dot/sub/invscal macros.
void bli_dtrsv_unf_var1
(
uplo_t uploa,
trans_t transa,
diag_t diaga,
dim_t m,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
cntx_t* cntx
)
{
double* one = PASTEMAC(d,1);
double* minus_one = PASTEMAC(d,m1);
double* A10;
double* A11;
double* A12;
double* a10t;
double* alpha11;
double* a12t;
double* x0;
double* x1;
double* x2;
double* x01;
double* chi11;
double* x21;
double alpha11_conj;
double rho1;
dim_t iter, i, k, j, l;
dim_t b_fuse, f;
dim_t n_behind, f_behind;
inc_t rs_at, cs_at;
uplo_t uploa_trans;
conj_t conja;
/* x = alpha * x; */
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
alpha,
x, incx,
cntx,
NULL
)
;
if( bli_does_notrans( transa ) )
{
rs_at = rs_a;
cs_at = cs_a;
uploa_trans = uploa;
}
else /* if ( bli_does_trans( transa ) ) */
{
// Fold the transpose into swapped strides and a toggled uplo so the
// remainder of the routine only deals with a "no transpose" view.
rs_at = cs_a;
cs_at = rs_a;
uploa_trans = bli_uplo_toggled( uploa );
}
conja = bli_extract_conj( transa );
PASTECH(d,dotxf_ker_ft) kfp_df;
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE) {
// AVX path: hand-written zen dotxf kernel with fixed fusing factor 8.
kfp_df = bli_ddotxf_zen_int_8;
b_fuse = 8;
}
else
{
// Generic path: query the context (lazily initialized) for the
// architecture's dotxf kernel and its default fusing blocksize.
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
num_t dt = PASTEMAC(d,type);
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
}
/* We reduce all of the possible cases down to just lower/upper. */
if ( bli_is_upper( uploa_trans ) )
{
// Upper triangular: march panels from the bottom of the matrix up.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
i = m - iter - f;
n_behind = iter;
A11 = a + (i )*rs_at + (i )*cs_at;
A12 = a + (i )*rs_at + (i+f)*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
/* x1 = x1 - A12 * x2; */
kfp_df
(
conja,
BLIS_NO_CONJUGATE,
n_behind,
f,
minus_one,
A12, cs_at, rs_at,
x2, incx,
one,
x1, incx,
cntx
);
/* x1 = x1 / triu( A11 ); */
for ( k = 0; k < f; ++k )
{
// Back-substitution within the f x f triangular tip, last row first.
l = f - k - 1;
f_behind = k;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a12t = A11 + (l )*rs_at + (l+1)*cs_at;
chi11 = x1 + (l )*incx;
x21 = x1 + (l+1)*incx;
/* chi11 = chi11 - a12t * x21; */
PASTEMAC(d,set0s)( rho1 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
}
else
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
}
PASTEMAC(d,subs)( rho1, *chi11 );
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
}
}
}
}
else /* if ( bli_is_lower( uploa_trans ) ) */
{
// Lower triangular: march panels from the top of the matrix down.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
i = iter;
n_behind = i;
A11 = a + (i )*rs_at + (i )*cs_at;
A10 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (i )*incx;
x0 = x + (0 )*incx;
/* x1 = x1 - A10 * x0; */
kfp_df
(
conja,
BLIS_NO_CONJUGATE,
n_behind,
f,
minus_one,
A10, cs_at, rs_at,
x0, incx,
one,
x1, incx,
cntx
);
/* x1 = x1 / tril( A11 ); */
for ( k = 0; k < f; ++k )
{
// Forward substitution within the f x f triangular tip.
l = k;
f_behind = l;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a10t = A11 + (l )*rs_at + (0 )*cs_at;
chi11 = x1 + (l )*incx;
x01 = x1 + (0 )*incx;
/* chi11 = chi11 - a10t * x01; */
PASTEMAC(d,set0s)( rho1 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
}
else
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
}
PASTEMAC(d,subs)( rho1, *chi11 );
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
}
}
}
}
}
// bli_strsv_unf_var1: single-precision counterpart of bli_dtrsv_unf_var1.
// Solves op(A) * x = alpha * x in place using a fused dotxf kernel for the
// panel update and scalar macros for the triangular tip. Structure is
// identical to the double-precision routine; only the datatype differs.
void bli_strsv_unf_var1
(
uplo_t uploa,
trans_t transa,
diag_t diaga,
dim_t m,
float* alpha,
float* a, inc_t rs_a, inc_t cs_a,
float* x, inc_t incx,
cntx_t* cntx
)
{
float* one = PASTEMAC(s,1);
float* minus_one = PASTEMAC(s,m1);
float* A10;
float* A11;
float* A12;
float* a10t;
float* alpha11;
float* a12t;
float* x0;
float* x1;
float* x2;
float* x01;
float* chi11;
float* x21;
float alpha11_conj;
float rho1;
dim_t iter, i, k, j, l;
dim_t b_fuse, f;
dim_t n_behind, f_behind;
inc_t rs_at, cs_at;
uplo_t uploa_trans;
conj_t conja;
/* x = alpha * x; */
PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
alpha,
x, incx,
cntx,
NULL
);
if( bli_does_notrans( transa ) )
{
rs_at = rs_a;
cs_at = cs_a;
uploa_trans = uploa;
}
else /* if ( bli_does_trans( transa ) ) */
{
// Fold the transpose into swapped strides and a toggled uplo.
rs_at = cs_a;
cs_at = rs_a;
uploa_trans = bli_uplo_toggled( uploa );
}
conja = bli_extract_conj( transa );
PASTECH(s,dotxf_ker_ft) kfp_df;
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE) {
// AVX path: hand-written zen dotxf kernel, fixed fusing factor 8.
kfp_df = bli_sdotxf_zen_int_8;
b_fuse = 8;
}
else
{
// Generic path: take kernel and fusing blocksize from the context.
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
num_t dt = PASTEMAC(s,type);
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
}
/* We reduce all of the possible cases down to just lower/upper. */
if ( bli_is_upper( uploa_trans ) )
{
// Upper triangular: back substitution, panels from the bottom up.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
i = m - iter - f;
n_behind = iter;
A11 = a + (i )*rs_at + (i )*cs_at;
A12 = a + (i )*rs_at + (i+f)*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
/* x1 = x1 - A12 * x2; */
kfp_df
(
conja,
BLIS_NO_CONJUGATE,
n_behind,
f,
minus_one,
A12, cs_at, rs_at,
x2, incx,
one,
x1, incx,
cntx
);
/* x1 = x1 / triu( A11 ); */
for ( k = 0; k < f; ++k )
{
l = f - k - 1;
f_behind = k;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a12t = A11 + (l )*rs_at + (l+1)*cs_at;
chi11 = x1 + (l )*incx;
x21 = x1 + (l+1)*incx;
/* chi11 = chi11 - a12t * x21; */
PASTEMAC(s,set0s)( rho1 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(s,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
}
else
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(s,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
}
PASTEMAC(s,subs)( rho1, *chi11 );
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(s,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(s,invscals)( alpha11_conj, *chi11 );
}
}
}
}
else /* if ( bli_is_lower( uploa_trans ) ) */
{
// Lower triangular: forward substitution, panels from the top down.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
i = iter;
n_behind = i;
A11 = a + (i )*rs_at + (i )*cs_at;
A10 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (i )*incx;
x0 = x + (0 )*incx;
/* x1 = x1 - A10 * x0; */
kfp_df
(
conja,
BLIS_NO_CONJUGATE,
n_behind,
f,
minus_one,
A10, cs_at, rs_at,
x0, incx,
one,
x1, incx,
cntx
);
/* x1 = x1 / tril( A11 ); */
for ( k = 0; k < f; ++k )
{
l = k;
f_behind = l;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a10t = A11 + (l )*rs_at + (0 )*cs_at;
chi11 = x1 + (l )*incx;
x01 = x1 + (0 )*incx;
/* chi11 = chi11 - a10t * x01; */
PASTEMAC(s,set0s)( rho1 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(s,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
}
else
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(s,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
}
PASTEMAC(s,subs)( rho1, *chi11 );
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(s,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(s,invscals)( alpha11_conj, *chi11 );
}
}
}
}
}
// Instantiate the scomplex (c) and dcomplex (z) variants of trsv_unf_var1
// from the type-generic macro template defined earlier in this file; the
// s and d variants are hand-written above.
INSERT_GENTFUNC_BASIC0_CZ( trsv_unf_var1 )

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -228,805 +228,5 @@ void PASTEMAC(ch,varname) \
} \
} \
}
#ifdef BLIS_CONFIG_EPYC
// bli_dtrsv_unf_var2: double-precision TRSV, unblocked fused variant 2
// (axpyf-based). Solves op(A) * x = alpha * x in place: the f x f
// triangular tip is solved with scalar macros, then the remaining part of
// x is updated with a fused axpyf kernel.
void bli_dtrsv_unf_var2
(
uplo_t uploa,
trans_t transa,
diag_t diaga,
dim_t m,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
cntx_t* cntx
)
{
double* minus_one = PASTEMAC(d,m1);
double* A01;
double* A11;
double* A21;
double* a01;
double* alpha11;
double* a21;
double* x0;
double* x1;
double* x2;
double* x01;
double* chi11;
double* x21;
double alpha11_conj;
double minus_chi11;
dim_t iter, i, k, j, l;
dim_t b_fuse, f;
dim_t n_ahead, f_ahead;
inc_t rs_at, cs_at;
uplo_t uploa_trans;
conj_t conja;
/* x = alpha * x; */
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
alpha,
x, incx,
cntx,
NULL
);
if ( bli_does_notrans( transa ) )
{
rs_at = rs_a;
cs_at = cs_a;
uploa_trans = uploa;
}
else /* if ( bli_does_trans( transa ) ) */
{
// Fold the transpose into swapped strides and a toggled uplo.
rs_at = cs_a;
cs_at = rs_a;
uploa_trans = bli_uplo_toggled( uploa );
}
conja = bli_extract_conj( transa );
PASTECH(d,axpyf_ker_ft) kfp_af;
/* Assign kernel function pointer and fusing factor. */
// NOTE(review): unlike var1 (which gates on bli_cpuid_is_avx_supported()),
// this variant selects the zen kernels by an explicit arch-id list --
// confirm the two dispatch styles are intentionally different.
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
// Zen path: hand-written axpyf kernel with fixed fusing factor 4.
kfp_af = bli_daxpyf_zen_int_16x4;
b_fuse = 4;
}
else
{
// Generic path: take kernel and fusing blocksize from the context.
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_AF, cntx );
}
/* We reduce all of the possible cases down to just lower/upper. */
if ( bli_is_upper( uploa_trans ) )
{
// Upper triangular: panels from the bottom up; after solving the tip,
// propagate x1 into the rows ahead (above) via axpyf.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
i = m - iter - f;
n_ahead = i;
A11 = a + (i )*rs_at + (i )*cs_at;
A01 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x0 = x + (0 )*incx;
/* x1 = x1 / triu( A11 ); */
for ( k = 0; k < f; ++k )
{
l = f - k - 1;
f_ahead = l;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a01 = A11 + (0 )*rs_at + (l )*cs_at;
chi11 = x1 + (l )*incx;
x01 = x1 + (0 )*incx;
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
}
/* x01 = x01 - chi11 * a01; */
PASTEMAC(d,neg2s)( *chi11, minus_chi11 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(d,axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
}
else
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(d,axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
}
}
/* x0 = x0 - A01 * x1; */
kfp_af
(
conja,
BLIS_NO_CONJUGATE,
n_ahead,
f,
minus_one,
A01, rs_at, cs_at,
x1, incx,
x0, incx,
cntx
);
}
}
else /* if ( bli_is_lower( uploa_trans ) ) */
{
// Lower triangular: panels from the top down; propagate x1 into the
// rows ahead (below) via axpyf.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
i = iter;
n_ahead = m - iter - f;
A11 = a + (i )*rs_at + (i )*cs_at;
A21 = a + (i+f)*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
/* x1 = x1 / tril( A11 ); */
for ( k = 0; k < f; ++k )
{
l = k;
f_ahead = f - k - 1;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a21 = A11 + (l+1)*rs_at + (l )*cs_at;
chi11 = x1 + (l )*incx;
x21 = x1 + (l+1)*incx;
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
}
/* x21 = x21 - chi11 * a21; */
PASTEMAC(d,neg2s)( *chi11, minus_chi11 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(d,axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
}
else
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(d,axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
}
}
/* x2 = x2 - A21 * x1; */
kfp_af
(
conja,
BLIS_NO_CONJUGATE,
n_ahead,
f,
minus_one,
A21, rs_at, cs_at,
x1, incx,
x2, incx,
cntx
);
}
}
}
// bli_strsv_unf_var2: single-precision TRSV, unblocked fused variant 2
// (axpyf-based). Same algorithm as the double-precision routine above;
// only the datatype, zen kernel, and fusing factor differ.
void bli_strsv_unf_var2
(
uplo_t uploa,
trans_t transa,
diag_t diaga,
dim_t m,
float* alpha,
float* a, inc_t rs_a, inc_t cs_a,
float* x, inc_t incx,
cntx_t* cntx
)
{
float* minus_one = PASTEMAC(s, m1);
float* A01;
float* A11;
float* A21;
float* a01;
float* alpha11;
float* a21;
float* x0;
float* x1;
float* x2;
float* x01;
float* chi11;
float* x21;
float alpha11_conj;
float minus_chi11;
dim_t iter, i, k, j, l;
dim_t b_fuse, f;
dim_t n_ahead, f_ahead;
inc_t rs_at, cs_at;
uplo_t uploa_trans;
conj_t conja;
/* x = alpha * x; */
PASTEMAC2(s, scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
alpha,
x, incx,
cntx,
NULL
);
if( bli_does_notrans( transa ) )
{
rs_at = rs_a;
cs_at = cs_a;
uploa_trans = uploa;
}
else /* if ( bli_does_trans( transa ) ) */
{
// Fold the transpose into swapped strides and a toggled uplo.
rs_at = cs_a;
cs_at = rs_a;
uploa_trans = bli_uplo_toggled( uploa );
}
conja = bli_extract_conj( transa );
PASTECH(s, axpyf_ker_ft) kfp_af;
/* Assign function pointer and fusing factor. */
// Zen architectures take the hand-written axpyf kernel; all others fall
// back to the kernel registered in the context.
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
kfp_af = bli_saxpyf_zen_int_5;
b_fuse = 5;
}
else
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_FLOAT, BLIS_AXPYF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( BLIS_FLOAT, BLIS_AF, cntx );
}
/* We reduce all of the possible cases down to just lower/upper. */
if ( bli_is_upper( uploa_trans ) )
{
// Upper triangular: solve the tip, then update rows above via axpyf.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
i = m - iter - f;
n_ahead = i;
A11 = a + (i )*rs_at + (i )*cs_at;
A01 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x0 = x + (0 )*incx;
/* x1 = x1 / triu( A11 ); */
for ( k = 0; k < f; ++k )
{
l = f - k - 1;
f_ahead = l;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a01 = A11 + (0 )*rs_at + (l )*cs_at;
chi11 = x1 + (l )*incx;
x01 = x1 + (0 )*incx;
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(s, copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(s, invscals)( alpha11_conj, *chi11 );
}
/* x01 = x01 - chi11 * a01; */
PASTEMAC(s, neg2s)( *chi11, minus_chi11 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(s, axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
}
else
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(s, axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
}
}
/* x0 = x0 - A01 * x1; */
kfp_af
(
conja,
BLIS_NO_CONJUGATE,
n_ahead,
f,
minus_one,
A01, rs_at, cs_at,
x1, incx,
x0, incx,
cntx
);
}
}
else /* if ( bli_is_lower( uploa_trans ) ) */
{
// Lower triangular: solve the tip, then update rows below via axpyf.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
i = iter;
n_ahead = m - iter - f;
A11 = a + (i )*rs_at + (i )*cs_at;
A21 = a + (i+f)*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
/* x1 = x1 / tril( A11 ); */
for ( k = 0; k < f; ++k )
{
l = k;
f_ahead = f - k - 1;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a21 = A11 + (l+1)*rs_at + (l )*cs_at;
chi11 = x1 + (l )*incx;
x21 = x1 + (l+1)*incx;
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(s, copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(s, invscals)( alpha11_conj, *chi11 );
}
/* x21 = x21 - chi11 * a21; */
PASTEMAC(s, neg2s)( *chi11, minus_chi11 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(s, axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
}
else
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(s, axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
}
}
/* x2 = x2 - A21 * x1; */
kfp_af
(
conja,
BLIS_NO_CONJUGATE,
n_ahead,
f,
minus_one,
A21, rs_at, cs_at,
x1, incx,
x2, incx,
cntx
);
}
}
}
// bli_ztrsv_unf_var2: double-complex TRSV, unblocked fused variant 2
// (axpyf-based). Same algorithm as the real-domain routines above; the
// conj flag extracted from transa selects conjugating scalar macros.
void bli_ztrsv_unf_var2
(
uplo_t uploa,
trans_t transa,
diag_t diaga,
dim_t m,
dcomplex* alpha,
dcomplex* a, inc_t rs_a, inc_t cs_a,
dcomplex* x, inc_t incx,
cntx_t* cntx
)
{
dcomplex* minus_one = PASTEMAC(z, m1);
dcomplex* A01;
dcomplex* A11;
dcomplex* A21;
dcomplex* a01;
dcomplex* alpha11;
dcomplex* a21;
dcomplex* x0;
dcomplex* x1;
dcomplex* x2;
dcomplex* x01;
dcomplex* chi11;
dcomplex* x21;
dcomplex alpha11_conj;
dcomplex minus_chi11;
dim_t iter, i, k, j, l;
dim_t b_fuse, f;
dim_t n_ahead, f_ahead;
inc_t rs_at, cs_at;
uplo_t uploa_trans;
conj_t conja;
/* x = alpha * x; */
PASTEMAC2(z, scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
alpha,
x, incx,
cntx,
NULL
);
if( bli_does_notrans( transa ) )
{
rs_at = rs_a;
cs_at = cs_a;
uploa_trans = uploa;
}
else /* if ( bli_does_trans( transa ) ) */
{
// Fold the transpose into swapped strides and a toggled uplo.
rs_at = cs_a;
cs_at = rs_a;
uploa_trans = bli_uplo_toggled( uploa );
}
conja = bli_extract_conj( transa );
PASTECH(z, axpyf_ker_ft) kfp_af;
/* Assign function pointer and fusing factor. */
// Zen architectures take the hand-written axpyf kernel; all others fall
// back to the kernel registered in the context.
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
kfp_af = bli_zaxpyf_zen_int_5;
b_fuse = 5;
}
else
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_DCOMPLEX, BLIS_AXPYF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_AF, cntx );
}
/* We reduce all of the possible cases down to just lower/upper. */
if ( bli_is_upper( uploa_trans ) )
{
// Upper triangular: solve the tip, then update rows above via axpyf.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
i = m - iter - f;
n_ahead = i;
A11 = a + (i )*rs_at + (i )*cs_at;
A01 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x0 = x + (0 )*incx;
/* x1 = x1 / triu( A11 ); */
for ( k = 0; k < f; ++k )
{
l = f - k - 1;
f_ahead = l;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a01 = A11 + (0 )*rs_at + (l )*cs_at;
chi11 = x1 + (l )*incx;
x01 = x1 + (0 )*incx;
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(z, copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(z, invscals)( alpha11_conj, *chi11 );
}
/* x01 = x01 - chi11 * a01; */
PASTEMAC(z, neg2s)( *chi11, minus_chi11 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(z, axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
}
else
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(z, axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
}
}
/* x0 = x0 - A01 * x1; */
kfp_af
(
conja,
BLIS_NO_CONJUGATE,
n_ahead,
f,
minus_one,
A01, rs_at, cs_at,
x1, incx,
x0, incx,
cntx
);
}
}
else /* if ( bli_is_lower( uploa_trans ) ) */
{
// Lower triangular: solve the tip, then update rows below via axpyf.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
i = iter;
n_ahead = m - iter - f;
A11 = a + (i )*rs_at + (i )*cs_at;
A21 = a + (i+f)*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
/* x1 = x1 / tril( A11 ); */
for ( k = 0; k < f; ++k )
{
l = k;
f_ahead = f - k - 1;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a21 = A11 + (l+1)*rs_at + (l )*cs_at;
chi11 = x1 + (l )*incx;
x21 = x1 + (l+1)*incx;
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(z, copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(z, invscals)( alpha11_conj, *chi11 );
}
/* x21 = x21 - chi11 * a21; */
PASTEMAC(z, neg2s)( *chi11, minus_chi11 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(z, axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
}
else
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(z, axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
}
}
/* x2 = x2 - A21 * x1; */
kfp_af
(
conja,
BLIS_NO_CONJUGATE,
n_ahead,
f,
minus_one,
A21, rs_at, cs_at,
x1, incx,
x2, incx,
cntx
);
}
}
}
// bli_ctrsv_unf_var2: single-complex TRSV, unblocked fused variant 2
// (axpyf-based). Same algorithm as the other datatype instances above.
void bli_ctrsv_unf_var2
(
uplo_t uploa,
trans_t transa,
diag_t diaga,
dim_t m,
scomplex* alpha,
scomplex* a, inc_t rs_a, inc_t cs_a,
scomplex* x, inc_t incx,
cntx_t* cntx
)
{
scomplex* minus_one = PASTEMAC(c, m1);
scomplex* A01;
scomplex* A11;
scomplex* A21;
scomplex* a01;
scomplex* alpha11;
scomplex* a21;
scomplex* x0;
scomplex* x1;
scomplex* x2;
scomplex* x01;
scomplex* chi11;
scomplex* x21;
scomplex alpha11_conj;
scomplex minus_chi11;
dim_t iter, i, k, j, l;
dim_t b_fuse, f;
dim_t n_ahead, f_ahead;
inc_t rs_at, cs_at;
uplo_t uploa_trans;
conj_t conja;
/* x = alpha * x; */
PASTEMAC2(c, scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
alpha,
x, incx,
cntx,
NULL
);
if( bli_does_notrans( transa ) )
{
rs_at = rs_a;
cs_at = cs_a;
uploa_trans = uploa;
}
else /* if ( bli_does_trans( transa ) ) */
{
// Fold the transpose into swapped strides and a toggled uplo.
rs_at = cs_a;
cs_at = rs_a;
uploa_trans = bli_uplo_toggled( uploa );
}
conja = bli_extract_conj( transa );
PASTECH(c, axpyf_ker_ft) kfp_af;
/* Assign function pointer and fusing factor. */
// Zen architectures take the hand-written axpyf kernel; all others fall
// back to the kernel registered in the context.
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
kfp_af = bli_caxpyf_zen_int_5;
b_fuse = 5;
}
else
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_AF, cntx );
}
/* We reduce all of the possible cases down to just lower/upper. */
if ( bli_is_upper( uploa_trans ) )
{
// Upper triangular: solve the tip, then update rows above via axpyf.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
i = m - iter - f;
n_ahead = i;
A11 = a + (i )*rs_at + (i )*cs_at;
A01 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x0 = x + (0 )*incx;
/* x1 = x1 / triu( A11 ); */
for ( k = 0; k < f; ++k )
{
l = f - k - 1;
f_ahead = l;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a01 = A11 + (0 )*rs_at + (l )*cs_at;
chi11 = x1 + (l )*incx;
x01 = x1 + (0 )*incx;
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(c, copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(c, invscals)( alpha11_conj, *chi11 );
}
/* x01 = x01 - chi11 * a01; */
PASTEMAC(c, neg2s)( *chi11, minus_chi11 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(c, axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
}
else
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(c, axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
}
}
/* x0 = x0 - A01 * x1; */
kfp_af
(
conja,
BLIS_NO_CONJUGATE,
n_ahead,
f,
minus_one,
A01, rs_at, cs_at,
x1, incx,
x0, incx,
cntx
);
}
}
else /* if ( bli_is_lower( uploa_trans ) ) */
{
// Lower triangular: solve the tip, then update rows below via axpyf.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
i = iter;
n_ahead = m - iter - f;
A11 = a + (i )*rs_at + (i )*cs_at;
A21 = a + (i+f)*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
/* x1 = x1 / tril( A11 ); */
for ( k = 0; k < f; ++k )
{
l = k;
f_ahead = f - k - 1;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a21 = A11 + (l+1)*rs_at + (l )*cs_at;
chi11 = x1 + (l )*incx;
x21 = x1 + (l+1)*incx;
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(c, copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(c, invscals)( alpha11_conj, *chi11 );
}
/* x21 = x21 - chi11 * a21; */
PASTEMAC(c, neg2s)( *chi11, minus_chi11 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(c, axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
}
else
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(c, axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
}
}
/* x2 = x2 - A21 * x1; */
kfp_af
(
conja,
BLIS_NO_CONJUGATE,
n_ahead,
f,
minus_one,
A21, rs_at, cs_at,
x1, incx,
x2, incx,
cntx
);
}
}
}
#else
INSERT_GENTFUNC_BASIC0( trsv_unf_var2 )
#endif
// NOTE(review): this instantiation appears to duplicate the one inside the
// #else branch directly above (and would conflict with the hand-written
// s/d/c/z functions when BLIS_CONFIG_EPYC is defined). This looks like a
// diff-rendering artifact or leftover -- confirm against the committed file.
INSERT_GENTFUNC_BASIC0( trsv_unf_var2 )

File diff suppressed because it is too large Load Diff

View File

@@ -48,120 +48,6 @@ err_t bli_gemmsup_int
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4);
#ifdef BLIS_CONFIG_EPYC
const num_t dt = bli_obj_dt( c );
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
const dim_t k = bli_obj_width( a );
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
const bool auto_factor = bli_rntm_auto_factor( rntm );
const dim_t n_threads = bli_rntm_num_threads( rntm );
dim_t jc_new;
dim_t ic_new;
//bli_gemmsup_ref_var2
//bli_gemmsup_ref_var1
#if 0
bli_gemmsup_ref_var1n
#else
#endif
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
stor_id == BLIS_RRC ||
stor_id == BLIS_RCR ||
stor_id == BLIS_CRR );
#ifdef TRACEVAR
if ( bli_thread_am_ochief( thread ) )
printf( "bli_l3_sup_int(): var2m primary\n" );
#endif
// Don't use the small/unpacked implementation if one of the matrices
// uses general stride.
if ( stor_id == BLIS_XXX ) {
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_4, "SUP doesn't support general stide.");
return BLIS_FAILURE;
}
if ( is_rrr_rrc_rcr_crr )
{
// This branch handles:
// - rrr rrc rcr crr for row-preferential kernels
// - rcc crc ccr ccc for column-preferential kernels
// - Currently only row-preferential kernels are only supported.
// calculate number of micropanels in m and n dimensions and
// recalculate the automatic thread factorization based on these number of micropanels
const dim_t mu = m / MR;
const dim_t nu = n / NR;
// If the parallel thread factorization was automatic, we update it
// with a new factorization based on the matrix dimensions in units
// of micropanels.
if ( auto_factor )
{
// In the block-panel algorithm, the m dimension is parallelized
// with ic_nt and the n dimension is parallelized with jc_nt.
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
// Update the ways of parallelism for the jc and ic loops, and then
// update the current thread's root thrinfo_t node according to the
// new ways of parallelism value for the jc loop.
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
bli_l3_sup_thrinfo_update_root( rntm, thread );
}
/*Enable packing for B matrix for higher sizes*/
if(bli_is_float(dt) && (n_threads==1)) {
if((m > 240) && (k > 240) && (n > 240))
bli_rntm_set_pack_b( 1, rntm );
}
bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
alpha, a, b, beta, c,
stor_id, cntx, rntm, thread );
}
else
{
// This branch handles:
// - rrr rrc rcr crr for column-preferential kernels
// - rcc crc ccr ccc for row-preferential kernels
// - Currently only row-preferential kernels are only supported.
const dim_t mu = n / MR; // the n becomes m after a transposition
const dim_t nu = m / NR; // the m becomes n after a transposition
if ( auto_factor )
{
// In the block-panel algorithm, the m dimension is parallelized
// with ic_nt and the n dimension is parallelized with jc_nt.
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
// Update the ways of parallelism for the jc and ic loops, and then
// update the current thread's root thrinfo_t node according to the
// new ways of parallelism value for the jc loop.
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
bli_l3_sup_thrinfo_update_root( rntm, thread );
}
/* Enable packing for B matrix for higher sizes. Note that pack A
* becomes pack B inside var2m because this is transpose case*/
if(bli_is_float(dt) && (n_threads==1)) {
if((m > 240) && (k > 240) && (n > 240))
bli_rntm_set_pack_a( 1, rntm );
}
bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
alpha, a, b, beta, c,
stor_id, cntx, rntm, thread );
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
return BLIS_SUCCESS;
#else // #ifdef BLIS_CONFIG_EPYC
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
// Don't use the small/unpacked implementation if one of the matrices
@@ -335,8 +221,6 @@ err_t bli_gemmsup_int
// Return success so that the caller knows that we computed the solution.
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
return BLIS_SUCCESS;
#endif
}
// -----------------------------------------------------------------------------
@@ -401,15 +285,9 @@ err_t bli_gemmtsup_int
// Decide which algorithm to use (block-panel var2m or panel-block
// var1n) based on the number of micropanels in the m and n dimensions.
// Also, recalculate the automatic thread factorization.
#ifdef BLIS_CONFIG_EPYC
if ( mu >= nu ) use_bp = TRUE;
else /* if ( mu < nu ) */ use_bp = TRUE;// var1n is not implemented for GEMMT
#else
if ( mu >= nu ) use_bp = TRUE;
else /* if ( mu < nu ) */ use_bp = FALSE;
#endif
// If the parallel thread factorization was automatic, we update it
// with a new factorization based on the matrix dimensions in units
// of micropanels.
@@ -472,14 +350,10 @@ err_t bli_gemmtsup_int
// Decide which algorithm to use (block-panel var2m or panel-block
// var1n) based on the number of micropanels in the m and n dimensions.
// Also, recalculate the automatic thread factorization.
#ifdef BLIS_CONFIG_EPYC
if ( mu >= nu ) use_bp = TRUE;
else /* if ( mu < nu ) */ use_bp = TRUE; //var1n is not implemented for gemmt
#else
if ( mu >= nu ) use_bp = TRUE;
else /* if ( mu < nu ) */ use_bp = FALSE;
#endif
// If the parallel thread factorization was automatic, we update it
// with a new factorization based on the matrix dimensions in units
// of micropanels.

View File

@@ -0,0 +1,352 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019-21, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// bli_gemmsup_int: AMD-specific internal back-end for the small/unpacked
// (SUP) GEMM path. Computes C := beta*C + alpha*A*B via
// bli_gemmsup_ref_var2m, choosing the non-transposed or transposed variant
// from the combined storage of C, A, and B, recomputing the automatic
// thread factorization in units of micropanels, and enabling B-packing for
// large single-threaded float problems.
//
// Parameters: alpha/beta are scalars; a, b, c are the operand objects;
// cntx supplies blocksizes; rntm carries threading/packing settings (may
// be modified here); thread is this thread's thrinfo_t node.
// Returns BLIS_SUCCESS on completion, or BLIS_FAILURE when any operand
// uses general stride (the SUP path does not support it).
err_t bli_gemmsup_int
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4);
const num_t dt = bli_obj_dt( c );
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
const dim_t k = bli_obj_width( a );
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
const bool auto_factor = bli_rntm_auto_factor( rntm );
const dim_t n_threads = bli_rntm_num_threads( rntm );
dim_t jc_new;
dim_t ic_new;
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
// Row-major-ish storage combinations are "primary" for the
// row-preferential kernels used here; all others are handled by
// logically transposing the operation.
const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
stor_id == BLIS_RRC ||
stor_id == BLIS_RCR ||
stor_id == BLIS_CRR );
#ifdef TRACEVAR
if ( bli_thread_am_ochief( thread ) )
printf( "bli_l3_sup_int(): var2m primary\n" );
#endif
// Don't use the small/unpacked implementation if one of the matrices
// uses general stride.
if ( stor_id == BLIS_XXX ) {
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_4, "SUP doesn't support general stride.");
return BLIS_FAILURE;
}
if ( is_rrr_rrc_rcr_crr )
{
// This branch handles:
// - rrr rrc rcr crr for row-preferential kernels
// - rcc crc ccr ccc for column-preferential kernels
// - Currently only row-preferential kernels are only supported.
// calculate number of micropanels in m and n dimensions and
// recalculate the automatic thread factorization based on these number of micropanels
const dim_t mu = m / MR;
const dim_t nu = n / NR;
// If the parallel thread factorization was automatic, we update it
// with a new factorization based on the matrix dimensions in units
// of micropanels.
if ( auto_factor )
{
// In the block-panel algorithm, the m dimension is parallelized
// with ic_nt and the n dimension is parallelized with jc_nt.
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
// Update the ways of parallelism for the jc and ic loops, and then
// update the current thread's root thrinfo_t node according to the
// new ways of parallelism value for the jc loop.
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
bli_l3_sup_thrinfo_update_root( rntm, thread );
}
/*Enable packing for B matrix for higher sizes*/
if(bli_is_float(dt) && (n_threads==1)) {
if((m > 240) && (k > 240) && (n > 240))
bli_rntm_set_pack_b( 1, rntm );
}
bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
alpha, a, b, beta, c,
stor_id, cntx, rntm, thread );
}
else
{
// This branch handles:
// - rrr rrc rcr crr for column-preferential kernels
// - rcc crc ccr ccc for row-preferential kernels
// - Currently only row-preferential kernels are only supported.
const dim_t mu = n / MR; // the n becomes m after a transposition
const dim_t nu = m / NR; // the m becomes n after a transposition
if ( auto_factor )
{
// In the block-panel algorithm, the m dimension is parallelized
// with ic_nt and the n dimension is parallelized with jc_nt.
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
// Update the ways of parallelism for the jc and ic loops, and then
// update the current thread's root thrinfo_t node according to the
// new ways of parallelism value for the jc loop.
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
bli_l3_sup_thrinfo_update_root( rntm, thread );
}
/* Enable packing for B matrix for higher sizes. Note that pack A
* becomes pack B inside var2m because this is transpose case*/
if(bli_is_float(dt) && (n_threads==1)) {
if((m > 240) && (k > 240) && (n > 240))
bli_rntm_set_pack_a( 1, rntm );
}
bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
alpha, a, b, beta, c,
stor_id, cntx, rntm, thread );
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
return BLIS_SUCCESS;
}
// -----------------------------------------------------------------------------
// bli_gemmtsup_int(): internal dispatcher for the small/unpacked (sup)
// GEMMT path.
//
// The function:
//  - rejects general-stride storage (the sup path does not support it);
//  - selects the "primary" orientation based on whether the gemmsup
//    microkernel prefers row or column storage of C;
//  - recomputes the automatic thread factorization in units of
//    micropanels; and
//  - invokes the block-panel reference macrokernel (var2m), with an
//    implicit transposition in the non-primary case.
//
// The panel-block variant (var1n) is not implemented for GEMMT, so
// use_bp is always TRUE; the var1n branches are retained for when that
// variant becomes available.
//
// Returns BLIS_SUCCESS when the operation was computed, or BLIS_FAILURE
// when the sup path cannot handle the problem (caller falls back).
err_t bli_gemmtsup_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4);
	// AOCL_DTL_LOG_GEMMT_INPUTS(AOCL_DTL_LEVEL_TRACE_4, alpha, a, b, beta, c);

	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );

	// Don't use the small/unpacked implementation if one of the matrices
	// uses general stride.
	if ( stor_id == BLIS_XXX ) {
		// Fixed typo in the trace message: "stide" -> "stride".
		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_4, "SUP doesn't support general stride.");
		return BLIS_FAILURE;
	}

	const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
	                                  stor_id == BLIS_RRC ||
	                                  stor_id == BLIS_RCR ||
	                                  stor_id == BLIS_CRR );
	const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;

	const num_t dt       = bli_obj_dt( c );
	const bool  row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );

	// The "primary" case is the one whose storage combination matches the
	// microkernel's preferred orientation; the other case is handled by
	// transposing the whole operation below.
	const bool  is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
	                                    : is_rcc_crc_ccr_ccc );

	const dim_t m = bli_obj_length( c );
	// C is square for gemmt, so n is taken equal to m.
	const dim_t n = m;

	const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
	const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );

	const bool  auto_factor = bli_rntm_auto_factor( rntm );
	const dim_t n_threads   = bli_rntm_num_threads( rntm );

	bool  use_bp = TRUE;
	dim_t jc_new;
	dim_t ic_new;

	if ( is_primary )
	{
		// This branch handles:
		//  - rrr rrc rcr crr for row-preferential kernels
		//  - rcc crc ccr ccc for column-preferential kernels
		const dim_t mu = m / MR;
		const dim_t nu = n / NR;

		// Decide which algorithm to use (block-panel var2m or panel-block
		// var1n) based on the number of micropanels in the m and n dimensions.
		// Also, recalculate the automatic thread factorization.
		if ( mu >= nu )              use_bp = TRUE;
		else /* if ( mu < nu ) */    use_bp = TRUE; // var1n is not implemented for GEMMT

		// If the parallel thread factorization was automatic, we update it
		// with a new factorization based on the matrix dimensions in units
		// of micropanels.
		if ( auto_factor )
		{
			if ( use_bp )
			{
				// In the block-panel algorithm, the m dimension is parallelized
				// with ic_nt and the n dimension is parallelized with jc_nt.
				bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
			}
			else // if ( !use_bp )
			{
				// In the panel-block algorithm, the m dimension is parallelized
				// with jc_nt and the n dimension is parallelized with ic_nt.
				bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
			}

			// Update the ways of parallelism for the jc and ic loops, and then
			// update the current thread's root thrinfo_t node according to the
			// new ways of parallelism value for the jc loop.
			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
			bli_l3_sup_thrinfo_update_root( rntm, thread );
		}

		if ( use_bp )
		{
			#ifdef TRACEVAR
			if ( bli_thread_am_ochief( thread ) )
				printf( "bli_l3_sup_int(): var2m primary\n" );
			#endif
			// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
			bli_gemmtsup_ref_var2m( BLIS_NO_TRANSPOSE,
			                        alpha, a, b, beta, c,
			                        stor_id, cntx, rntm, thread );
		}
		else // use_pb
		{
			#ifdef TRACEVAR
			if ( bli_thread_am_ochief( thread ) )
				printf( "bli_l3_sup_int(): var1n primary\n" );
			#endif
			// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
			bli_gemmtsup_ref_var1n( BLIS_NO_TRANSPOSE,
			                        alpha, a, b, beta, c,
			                        stor_id, cntx, rntm, thread );
			// *requires nudging of nc up to be a multiple of mr.
		}
	}
	else
	{
		// This branch handles:
		//  - rrr rrc rcr crr for column-preferential kernels
		//  - rcc crc ccr ccc for row-preferential kernels
		const dim_t mu = n / MR; // the n becomes m after a transposition
		const dim_t nu = m / NR; // the m becomes n after a transposition

		// Decide which algorithm to use (block-panel var2m or panel-block
		// var1n) based on the number of micropanels in the m and n dimensions.
		// Also, recalculate the automatic thread factorization.
		if ( mu >= nu )              use_bp = TRUE;
		else /* if ( mu < nu ) */    use_bp = TRUE; // var1n is not implemented for GEMMT

		// If the parallel thread factorization was automatic, we update it
		// with a new factorization based on the matrix dimensions in units
		// of micropanels.
		if ( auto_factor )
		{
			if ( use_bp )
			{
				// In the block-panel algorithm, the m dimension is parallelized
				// with ic_nt and the n dimension is parallelized with jc_nt.
				bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
			}
			else // if ( !use_bp )
			{
				// In the panel-block algorithm, the m dimension is parallelized
				// with jc_nt and the n dimension is parallelized with ic_nt.
				bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
			}

			// Update the ways of parallelism for the jc and ic loops, and then
			// update the current thread's root thrinfo_t node according to the
			// new ways of parallelism value for the jc loop.
			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
			bli_l3_sup_thrinfo_update_root( rntm, thread );
		}

		if ( use_bp )
		{
			#ifdef TRACEVAR
			if ( bli_thread_am_ochief( thread ) )
				printf( "bli_l3_sup_int(): var2m non-primary\n" );
			#endif
			// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
			bli_gemmtsup_ref_var2m( BLIS_TRANSPOSE,
			                        alpha, a, b, beta, c,
			                        stor_id, cntx, rntm, thread );
		}
		else // use_pb
		{
			#ifdef TRACEVAR
			if ( bli_thread_am_ochief( thread ) )
				printf( "bli_l3_sup_int(): var1n non-primary\n" );
			#endif
			// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
			bli_gemmtsup_ref_var1n( BLIS_TRANSPOSE,
			                        alpha, a, b, beta, c,
			                        stor_id, cntx, rntm, thread );
			// *requires nudging of mc up to be a multiple of nr.
		}
	}

	// Return success so that the caller knows that we computed the solution.
	// (Added the terminating semicolon for consistency with the trace-exit
	// call at every other return site.)
	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
	return BLIS_SUCCESS;
}

View File

@@ -177,19 +177,6 @@ void bli_gemm_front
dim_t m_dim_local = bli_obj_length( &c_local );
dim_t n_dim_local = bli_obj_width( &c_local );
dim_t k_dim_local = bli_obj_width( &a_local );
#ifdef BLIS_CONFIG_EPYC
// Regression observed in sgemm native path in cases where m >= 4 * n
// after BLIS_THREAD_RATIO_M updated from 2 to 1 as part of commit
// 11dfc176a3c422729f453f6c23204cf023e9954d. Temporary workaround for
// the issue.
if( bli_obj_is_float( &c_local ) &&
( n_dim_local >= 1024 ) &&
( k_dim_local >= 1024 ) &&
( m_dim_local >= ( 4 * n_dim_local ) ) )
{
m_dim_local *= 2;
}
#endif
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any

View File

@@ -0,0 +1,413 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// bli_gemm_front(): front-end for the GEMM operation. Performs parameter
// checks and early exits, optional small-matrix dispatch, mixed-datatype
// setup, the storage-preference transposition optimization, and threading
// setup, then hands off to bli_gemm_int() via the thread decorator.
//
// NOTE(review): this appears to be the AMD-specific duplicate of the
// generic bli_gemm_front() -- it inlines unconditionally the sgemm
// thread-factorization workaround that the generic file had guarded with
// BLIS_CONFIG_EPYC (removed in the hunk above). Confirm against the
// build system's *_amd.c file selection.
void bli_gemm_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
	bli_init_once();

	obj_t a_local;
	obj_t b_local;
	obj_t c_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_check( alpha, a, b, beta, c, cntx );

	// If C has a zero dimension, return early.
	// NOTE(review): this return (and the scalm return below) exits without
	// AOCL_DTL_TRACE_EXIT, leaving the trace nesting unbalanced -- confirm
	// whether that is intended.
	if ( bli_obj_has_zero_dim( c ) )
	{
		return;
	}

	// If alpha is zero, or if A or B has a zero dimension, scale C by beta
	// and return early.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
	     bli_obj_has_zero_dim( a ) ||
	     bli_obj_has_zero_dim( b ) )
	{
		bli_scalm( beta, c );
		return;
	}

#ifdef BLIS_ENABLE_SMALL_MATRIX
	// Only handle small problems separately for homogeneous datatypes.
	if ( bli_obj_dt( a ) == bli_obj_dt( b ) &&
	     bli_obj_dt( a ) == bli_obj_dt( c ) &&
	     bli_obj_comp_prec( c ) == bli_obj_prec( c ) )
	{
		// If the small-matrix path accepts the problem, it computes the
		// full result and we are done.
		err_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl );
		if ( status == BLIS_SUCCESS )
		{
			AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
			return;
		}
	}
#endif

	// Alias A, B, and C in case we need to apply transformations.
	bli_obj_alias_to( a, &a_local );
	bli_obj_alias_to( b, &b_local );
	bli_obj_alias_to( c, &c_local );

#ifdef BLIS_ENABLE_GEMM_MD
	cntx_t cntx_local;

	// If any of the storage datatypes differ, or if the computation precision
	// differs from the storage precision of C, utilize the mixed datatype
	// code path.
	// NOTE: If we ever want to support the caller setting the computation
	// domain explicitly, we will need to check the computation dt against the
	// storage dt of C (instead of the computation precision against the
	// storage precision of C).
	if ( bli_obj_dt( &c_local ) != bli_obj_dt( &a_local ) ||
	     bli_obj_dt( &c_local ) != bli_obj_dt( &b_local ) ||
	     bli_obj_comp_prec( &c_local ) != bli_obj_prec( &c_local ) )
	{
		// Handle mixed datatype cases in bli_gemm_md(), which may modify
		// the objects or the context. (If the context is modified, cntx
		// is adjusted to point to cntx_local.)
		bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx );
	}
	//else // homogeneous datatypes
#endif

	// Load the pack schemas from the context and embed them into the objects
	// for A and B. (Native contexts are initialized with the correct pack
	// schemas, as are contexts for 1m, and if necessary bli_gemm_md() would
	// have made a copy and modified the schemas, so reading them from the
	// context should be a safe bet at this point.) This is a sort of hack for
	// communicating the desired pack schemas to bli_gemm_cntl_create() (via
	// bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows us
	// to subsequently access the schemas from the control tree, which
	// hopefully reduces some confusion, particularly in bli_packm_init().
	const pack_t schema_a = bli_cntx_schema_a_block( cntx );
	const pack_t schema_b = bli_cntx_schema_b_panel( cntx );

	bli_obj_set_pack_schema( schema_a, &a_local );
	bli_obj_set_pack_schema( schema_b, &b_local );

	// Next, we handle the possibility of needing to typecast alpha to the
	// computation datatype and/or beta to the storage datatype of C.

	// Attach alpha to B, and in the process typecast alpha to the target
	// datatype of the matrix (which in this case is equal to the computation
	// datatype).
	bli_obj_scalar_attach( BLIS_NO_CONJUGATE, alpha, &b_local );

	// Attach beta to C, and in the process typecast beta to the target
	// datatype of the matrix (which in this case is equal to the storage
	// datatype of C).
	bli_obj_scalar_attach( BLIS_NO_CONJUGATE, beta, &c_local );

	// Change the alpha and beta pointers to BLIS_ONE since the values have
	// now been typecast and attached to the matrices above.
	alpha = &BLIS_ONE;
	beta  = &BLIS_ONE;

#ifdef BLIS_ENABLE_GEMM_MD
	// Don't perform the following optimization for ccr or crc cases, as
	// those cases are sensitive to the ukernel storage preference (ie:
	// transposing the operation would break them).
	if ( !bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
	     !bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) )
#endif
	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
	{
		bli_obj_swap( &a_local, &b_local );

		bli_obj_induce_trans( &a_local );
		bli_obj_induce_trans( &b_local );
		bli_obj_induce_trans( &c_local );

		// We must also swap the pack schemas, which were set by bli_gemm_md()
		// or the inlined code above.
		bli_obj_swap_pack_schemas( &a_local, &b_local );
	}

	dim_t m_dim_local = bli_obj_length( &c_local );
	dim_t n_dim_local = bli_obj_width( &c_local );
	dim_t k_dim_local = bli_obj_width( &a_local );

	// Regression observed in sgemm native path in cases where m >= 4 * n
	// after BLIS_THREAD_RATIO_M updated from 2 to 1 as part of commit
	// 11dfc176a3c422729f453f6c23204cf023e9954d. Temporary workaround for
	// the issue: inflate the m dimension seen by the thread partitioner
	// (the actual problem dimensions are unchanged).
	if( bli_obj_is_float( &c_local ) &&
	    ( n_dim_local >= 1024 ) &&
	    ( k_dim_local >= 1024 ) &&
	    ( m_dim_local >= ( 4 * n_dim_local ) ) )
	{
		m_dim_local *= 2;
	}

	// Parse and interpret the contents of the rntm_t object to properly
	// set the ways of parallelism for each loop, and then make any
	// additional modifications necessary for the current operation.
	bli_rntm_set_ways_for_op
	(
	  BLIS_GEMM,
	  BLIS_LEFT, // ignored for gemm/hemm/symm
	  m_dim_local,
	  n_dim_local,
	  k_dim_local,
	  rntm
	);

	obj_t* cp    = &c_local;
	obj_t* betap = beta;

#ifdef BLIS_ENABLE_GEMM_MD
#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
	// If any of the following conditions are met, create a temporary matrix
	// conformal to C into which we will accumulate the matrix product:
	// - the storage precision of C differs from the computation precision;
	// - the domains are mixed as crr;
	// - the storage format of C does not match the preferred orientation
	//   of the ccr or crc cases.
	// Then, after the computation is complete, this matrix will be copied
	// or accumulated back to C.
	const bool is_ccr_mismatch =
	           ( bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
	             !bli_obj_is_col_stored( &c_local ) );
	const bool is_crc_mismatch =
	           ( bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) &&
	             !bli_obj_is_row_stored( &c_local ) );

	obj_t ct;
	bool  use_ct = FALSE;

	// FGVZ: Consider adding another guard here that only creates and uses a
	// temporary matrix for accumulation if k < c * kc, where c is some small
	// constant like 2. And don't forget to use the same conditional for the
	// castm() and free() at the end.
	if (
	     bli_obj_prec( &c_local ) != bli_obj_comp_prec( &c_local ) ||
	     bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) ||
	     is_ccr_mismatch ||
	     is_crc_mismatch
	   )
	{
		use_ct = TRUE;
	}

	// If we need a temporary matrix conformal to C for whatever reason,
	// we create it and prepare to use it now.
	if ( use_ct )
	{
		const dim_t m = bli_obj_length( &c_local );
		const dim_t n = bli_obj_width( &c_local );
		inc_t rs = bli_obj_row_stride( &c_local );
		inc_t cs = bli_obj_col_stride( &c_local );

		num_t dt_ct = bli_obj_domain( &c_local ) |
		              bli_obj_comp_prec( &c_local );

		// When performing the crr case, accumulate to a contiguously-stored
		// real matrix so we do not have to repeatedly update C with general
		// stride.
		if ( bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) )
			dt_ct = BLIS_REAL | bli_obj_comp_prec( &c_local );

		// When performing the mismatched ccr or crc cases, now is the time
		// to specify the appropriate storage so the gemm_md_c2r_ref() virtual
		// microkernel can output directly to C (instead of using a temporary
		// microtile).
		if      ( is_ccr_mismatch ) { rs = 1; cs = m; }
		else if ( is_crc_mismatch ) { rs = n; cs = 1; }

		bli_obj_create( dt_ct, m, n, rs, cs, &ct );

		const num_t dt_exec = bli_obj_exec_dt( &c_local );
		const num_t dt_comp = bli_obj_comp_dt( &c_local );

		bli_obj_set_target_dt( dt_ct, &ct );
		bli_obj_set_exec_dt( dt_exec, &ct );
		bli_obj_set_comp_dt( dt_comp, &ct );

		// A naive approach would cast C to the computation datatype,
		// compute with beta, and then cast the result back to the
		// user-provided output matrix. However, we employ a different
		// approach that halves the number of memops on C (or its
		// typecast temporary) by writing the A*B product directly to
		// temporary storage, and then using xpbym to scale the
		// output matrix by beta and accumulate/cast the A*B product.
		//bli_castm( &c_local, &ct );
		betap = &BLIS_ZERO;

		cp = &ct;
	}
#endif
#endif

	// Invoke the internal back-end via the thread handler.
	bli_l3_thread_decorator
	(
	  bli_gemm_int,
	  BLIS_GEMM, // operation family id
	  alpha,
	  &a_local,
	  &b_local,
	  betap,
	  cp,
	  cntx,
	  rntm,
	  cntl
	);

#ifdef BLIS_ENABLE_GEMM_MD
#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
	// If we created a temporary matrix conformal to C for whatever reason,
	// we copy/accumulate the result back to C and then release the object.
	if ( use_ct )
	{
		obj_t beta_local;

		bli_obj_scalar_detach( &c_local, &beta_local );

		//bli_castnzm( &ct, &c_local );
		bli_xpbym( &ct, &beta_local, &c_local );

		bli_obj_free( &ct );
	}
#endif
#endif

	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
// -----------------------------------------------------------------------------
#if 0
if ( bli_obj_dt( a ) != bli_obj_dt( b ) ||
bli_obj_dt( a ) != bli_obj_dt( c ) ||
bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
{
const bool a_is_real = bli_obj_is_real( a );
const bool a_is_comp = bli_obj_is_complex( a );
const bool b_is_real = bli_obj_is_real( b );
const bool b_is_comp = bli_obj_is_complex( b );
const bool c_is_real = bli_obj_is_real( c );
const bool c_is_comp = bli_obj_is_complex( c );
const bool a_is_single = bli_obj_is_single_prec( a );
const bool a_is_double = bli_obj_is_double_prec( a );
const bool b_is_single = bli_obj_is_single_prec( b );
const bool b_is_double = bli_obj_is_double_prec( b );
const bool c_is_single = bli_obj_is_single_prec( c );
const bool c_is_double = bli_obj_is_double_prec( c );
const bool comp_single = bli_obj_comp_prec( c ) == BLIS_SINGLE_PREC;
const bool comp_double = bli_obj_comp_prec( c ) == BLIS_DOUBLE_PREC;
const bool mixeddomain = bli_obj_domain( c ) != bli_obj_domain( a ) ||
bli_obj_domain( c ) != bli_obj_domain( b );
( void )a_is_real; ( void )a_is_comp;
( void )b_is_real; ( void )b_is_comp;
( void )c_is_real; ( void )c_is_comp;
( void )a_is_single; ( void )a_is_double;
( void )b_is_single; ( void )b_is_double;
( void )c_is_single; ( void )c_is_double;
( void )comp_single; ( void )comp_double;
if (
//( c_is_comp && a_is_comp && b_is_real ) ||
//( c_is_comp && a_is_real && b_is_comp ) ||
//( c_is_real && a_is_comp && b_is_comp ) ||
//( c_is_comp && a_is_real && b_is_real ) ||
//( c_is_real && a_is_comp && b_is_real ) ||
//( c_is_real && a_is_real && b_is_comp ) ||
//FALSE
TRUE
)
{
if (
( c_is_single && a_is_single && b_is_single && mixeddomain ) ||
( c_is_single && a_is_single && b_is_single && comp_single ) ||
( c_is_single && a_is_single && b_is_single && comp_double ) ||
( c_is_single && a_is_single && b_is_double ) ||
( c_is_single && a_is_double && b_is_single ) ||
( c_is_double && a_is_single && b_is_single ) ||
( c_is_single && a_is_double && b_is_double ) ||
( c_is_double && a_is_single && b_is_double ) ||
( c_is_double && a_is_double && b_is_single ) ||
( c_is_double && a_is_double && b_is_double && comp_single ) ||
( c_is_double && a_is_double && b_is_double && comp_double ) ||
( c_is_double && a_is_double && b_is_double && mixeddomain ) ||
FALSE
)
bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl );
else
bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl );
}
else
bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl );
return;
}
#else
#if 0
// If any of the storage datatypes differ, or if the execution precision
// differs from the storage precision of C, utilize the mixed datatype
// code path.
// NOTE: We could check the exec dt against the storage dt of C, but for
// now we don't support the caller setting the execution domain
// explicitly.
if ( bli_obj_dt( a ) != bli_obj_dt( b ) ||
bli_obj_dt( a ) != bli_obj_dt( c ) ||
bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
{
bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl );
return;
}
#endif
#endif

View File

@@ -501,6 +501,25 @@ bool bli_cpuid_is_bulldozer
return TRUE;
}
// Return TRUE if the processor reports all of the CPU features required
// by the AVX code paths, FALSE otherwise. Despite the "avx" in the name,
// FMA3 and AVX2 support are required as well.
bool bli_cpuid_is_avx_supported( void )
{
	uint32_t family, model, features;

	// Execute the CPUID instruction and decode its results into a family
	// id, a model id, and a feature bit field. (The return value encodes
	// the vendor, which is not needed here.)
	bli_cpuid_query( &family, &model, &features );

	// All three feature bits must be present.
	const uint32_t required = FEATURE_AVX  |
	                          FEATURE_FMA3 |
	                          FEATURE_AVX2;

	return bli_cpuid_has_features( features, required );
}
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)
arch_t bli_cpuid_query_id( void )

View File

@@ -133,7 +133,7 @@ BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want )
void get_cpu_name( char *cpu_name );
int vpu_count( void );
bool bli_cpuid_is_avx_supported(void);
enum
{
@@ -160,6 +160,8 @@ enum
FEATURE_AVX512VL = 0x4000
};
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)
char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath );

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018-2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -98,217 +98,5 @@ f77_int PASTEF772(i,chx,blasname) \
}
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_EPYC
// isamax_(): BLAS-compatible ISAMAX wrapper (single-precision real).
// Returns the 1-based index of the element of x with the largest absolute
// value, or 0 for an empty/invalid vector (netlib semantics). Dispatches
// to the hand-written zen kernel on AMD zen architectures and to the
// context-derived reference path otherwise.
f77_int isamax_
     (
       const f77_int* n,
       const float*   x, const f77_int* incx
     )
{
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
	AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx);

	dim_t   n0;
	float*  x0;
	inc_t   incx0;
	gint_t  bli_index;
	f77_int f77_index;

	/* If the vector is empty, return an index of zero. This early check
	   is needed to emulate netlib BLAS. Without it, bli_?amaxv() will
	   return 0, which ends up getting incremented to 1 (below) before
	   being returned, which is not what we want. */
	if ( *n < 1 || *incx <= 0 ) {
		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "isamax_: vector empty");
		return 0;
	}

	/* Initialize BLIS. */
	// bli_init_auto();

	/* Convert/typecast negative values of n to zero. */
	if ( *n < 0 ) n0 = ( dim_t )0;
	else          n0 = ( dim_t )(*n);

	/* If the input increments are negative, adjust the pointers so we can
	   use positive increments instead. */
	if ( *incx < 0 )
	{
		/* The semantics of negative stride in BLAS are that the vector
		   operand be traversed in reverse order. (Another way to think
		   of this is that negative strides effectively reverse the order
		   of the vector, but without any explicit data movements.) This
		   is also how BLIS interprets negative strides. The differences
		   is that with BLAS, the caller *always* passes in the 0th (i.e.,
		   top-most or left-most) element of the vector, even when the
		   stride is negative. By contrast, in BLIS, negative strides are
		   used *relative* to the vector address as it is given. Thus, in
		   BLIS, if this backwards traversal is desired, the caller *must*
		   pass in the address to the (n-1)th (i.e., the bottom-most or
		   right-most) element along with a negative stride. */
		x0    = ((float*)x) + (n0-1)*(-*incx);
		incx0 = ( inc_t )(*incx);
	}
	else
	{
		x0    = ((float*)x);
		incx0 = ( inc_t )(*incx);
	}

	// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
	// This function is invoked on all architectures including generic.
	// Invoke architecture specific kernels only if we are sure that we are running on
	// zen, zen2, zen3 or zen4; otherwise fall back to reference kernels (via framework
	// and context).
	arch_t id = bli_arch_query_id();
	bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
	               (id == BLIS_ARCH_ZEN3) ||
	               (id == BLIS_ARCH_ZEN2) ||
	               (id == BLIS_ARCH_ZEN);

	if (bamdzen)
	{
		/* Call BLIS kernel */
		bli_samaxv_zen_int
		(
		  n0,
		  x0, incx0,
		  &bli_index,
		  NULL
		);
	}
	else
	{
		// Reference path: expanded typed API call (bli_samaxv_ex()).
		PASTEMAC2(s,amaxv,BLIS_TAPI_EX_SUF)
		(
		  n0,
		  x0, incx0,
		  &bli_index,
		  NULL,
		  NULL
		);
	}

	/* Convert zero-based BLIS (C) index to one-based BLAS (Fortran)
	   index. Also, if the BLAS integer size differs from the BLIS
	   integer size, that typecast occurs here. */
	f77_index = bli_index + 1;

	/* Finalize BLIS. */
	// bli_finalize_auto();

	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
	return f77_index;
}
// idamax_(): BLAS-compatible IDAMAX wrapper (double-precision real).
// Returns the 1-based index of the element of x with the largest absolute
// value, or 0 for an empty/invalid vector (netlib semantics). Dispatches
// to the hand-written zen kernel on AMD zen architectures and to the
// context-derived reference path otherwise.
f77_int idamax_
     (
       const f77_int* n,
       const double*  x, const f77_int* incx
     )
{
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
	AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx);

	dim_t   n0;
	double* x0;
	inc_t   incx0;
	gint_t  bli_index;
	f77_int f77_index;

	/* If the vector is empty, return an index of zero. This early check
	   is needed to emulate netlib BLAS. Without it, bli_?amaxv() will
	   return 0, which ends up getting incremented to 1 (below) before
	   being returned, which is not what we want. */
	if ( *n < 1 || *incx <= 0 ) {
		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "idamax_: vector empty");
		return 0;
	}

	/* Initialize BLIS. */
	// bli_init_auto();

	/* Convert/typecast negative values of n to zero. */
	if ( *n < 0 ) n0 = ( dim_t )0;
	else          n0 = ( dim_t )(*n);

	/* If the input increments are negative, adjust the pointers so we can
	   use positive increments instead. */
	if ( *incx < 0 )
	{
		/* The semantics of negative stride in BLAS are that the vector
		   operand be traversed in reverse order. (Another way to think
		   of this is that negative strides effectively reverse the order
		   of the vector, but without any explicit data movements.) This
		   is also how BLIS interprets negative strides. The differences
		   is that with BLAS, the caller *always* passes in the 0th (i.e.,
		   top-most or left-most) element of the vector, even when the
		   stride is negative. By contrast, in BLIS, negative strides are
		   used *relative* to the vector address as it is given. Thus, in
		   BLIS, if this backwards traversal is desired, the caller *must*
		   pass in the address to the (n-1)th (i.e., the bottom-most or
		   right-most) element along with a negative stride. */
		x0    = ((double*)x) + (n0-1)*(-*incx);
		incx0 = ( inc_t )(*incx);
	}
	else
	{
		x0    = ((double*)x);
		incx0 = ( inc_t )(*incx);
	}

	// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
	// This function is invoked on all architectures including generic.
	// Invoke architecture specific kernels only if we are sure that we are running on
	// zen, zen2, zen3 or zen4; otherwise fall back to reference kernels (via framework
	// and context).
	arch_t id = bli_arch_query_id();
	bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
	               (id == BLIS_ARCH_ZEN3) ||
	               (id == BLIS_ARCH_ZEN2) ||
	               (id == BLIS_ARCH_ZEN);

	if (bamdzen)
	{
		/* Call BLIS kernel */
		bli_damaxv_zen_int
		(
		  n0,
		  x0, incx0,
		  &bli_index,
		  NULL
		);
	}
	else
	{
		// Reference path: expanded typed API call (bli_damaxv_ex()).
		PASTEMAC2(d,amaxv,BLIS_TAPI_EX_SUF)
		(
		  n0,
		  x0, incx0,
		  &bli_index,
		  NULL,
		  NULL
		);
	}

	/* Convert zero-based BLIS (C) index to one-based BLAS (Fortran)
	   index. Also, if the BLAS integer size differs from the BLIS
	   integer size, that typecast occurs here. */
	f77_index = bli_index + 1;

	/* Finalize BLIS. */
	// bli_finalize_auto();

	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
	return f77_index;
}
INSERT_GENTFUNC_BLAS_CZ( amax, amaxv )
#else
INSERT_GENTFUNC_BLAS( amax, amaxv )
#endif
#endif

295
frame/compat/bla_amax_amd.c Normal file
View File

@@ -0,0 +1,295 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNC template for the BLAS i?amax routines: for a given element
// type (ftype_x) and type character (chx), it expands to a Fortran-style
// wrapper i<chx><blasname>_ (e.g. isamax_, idamax_) that converts BLAS
// calling conventions (1-based result index, negative increments, and a
// possibly different integer width) into a call to the BLIS expanded
// typed API bli_?amaxv_ex() and returns the 1-based index.
#undef GENTFUNC
#define GENTFUNC( ftype_x, chx, blasname, blisname ) \
\
f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     ) \
{ \
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
	AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(chx), *n, *incx) \
\
	dim_t    n0; \
	ftype_x* x0; \
	inc_t    incx0; \
	gint_t   bli_index; \
	f77_int  f77_index; \
\
	/* If the vector is empty, return an index of zero. This early check
	   is needed to emulate netlib BLAS. Without it, bli_?amaxv() will
	   return 0, which ends up getting incremented to 1 (below) before
	   being returned, which is not what we want. */ \
	if ( *n < 1 || *incx <= 0 ) { \
		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "iamax_: vector empty") \
		return 0; \
	}\
\
	/* Initialize BLIS. */ \
	bli_init_auto(); \
\
	/* Convert/typecast negative values of n to zero. */ \
	bli_convert_blas_dim1( *n, n0 ); \
\
	/* If the input increments are negative, adjust the pointers so we can
	   use positive increments instead. */ \
	bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \
\
	/* Call BLIS interface. */ \
	PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
	( \
	  n0, \
	  x0, incx0, \
	  &bli_index, \
	  NULL, \
	  NULL \
	); \
\
	/* Convert zero-based BLIS (C) index to one-based BLAS (Fortran)
	   index. Also, if the BLAS integer size differs from the BLIS
	   integer size, that typecast occurs here. */ \
	f77_index = bli_index + 1; \
\
	/* Finalize BLIS. */ \
	bli_finalize_auto(); \
\
	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
	return f77_index; \
}
#ifdef BLIS_ENABLE_BLAS
// isamax_(): BLAS-compatible ISAMAX wrapper (single-precision real),
// AMD-optimized variant. Returns the 1-based index of the element of x
// with the largest absolute value, or 0 for an empty/invalid vector
// (netlib semantics). Uses a runtime AVX-capability check instead of a
// compile-time architecture macro to select the optimized kernel.
f77_int isamax_
     (
       const f77_int* n,
       const float*   x, const f77_int* incx
     )
{
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
	AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx);

	dim_t   n0;
	float*  x0;
	inc_t   incx0;
	gint_t  bli_index;
	f77_int f77_index;

	/* If the vector is empty, return an index of zero. This early check
	   is needed to emulate netlib BLAS. Without it, bli_?amaxv() will
	   return 0, which ends up getting incremented to 1 (below) before
	   being returned, which is not what we want. */
	if ( *n < 1 || *incx <= 0 ) {
		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "isamax_: vector empty");
		return 0;
	}

	/* Initialize BLIS. */
	// bli_init_auto();
	// NOTE(review): initialization is skipped here (presumably performed
	// elsewhere by the library entry points) -- confirm.

	/* Convert/typecast negative values of n to zero. */
	if ( *n < 0 ) n0 = ( dim_t )0;
	else          n0 = ( dim_t )(*n);

	/* If the input increments are negative, adjust the pointers so we can
	   use positive increments instead. */
	if ( *incx < 0 )
	{
		/* The semantics of negative stride in BLAS are that the vector
		   operand be traversed in reverse order. (Another way to think
		   of this is that negative strides effectively reverse the order
		   of the vector, but without any explicit data movements.) This
		   is also how BLIS interprets negative strides. The differences
		   is that with BLAS, the caller *always* passes in the 0th (i.e.,
		   top-most or left-most) element of the vector, even when the
		   stride is negative. By contrast, in BLIS, negative strides are
		   used *relative* to the vector address as it is given. Thus, in
		   BLIS, if this backwards traversal is desired, the caller *must*
		   pass in the address to the (n-1)th (i.e., the bottom-most or
		   right-most) element along with a negative stride. */
		x0    = ((float*)x) + (n0-1)*(-*incx);
		incx0 = ( inc_t )(*incx);
	}
	else
	{
		x0    = ((float*)x);
		incx0 = ( inc_t )(*incx);
	}

	// This function is invoked on all architectures including generic.
	// Non-AVX platforms will use the kernels derived from the context.
	if (bli_cpuid_is_avx_supported() == TRUE)
	{
		/* Call BLIS kernel */
		bli_samaxv_zen_int
		(
		  n0,
		  x0, incx0,
		  &bli_index,
		  NULL
		);
	}
	else
	{
		// Reference path: expanded typed API call (bli_samaxv_ex()).
		PASTEMAC2(s,amaxv,BLIS_TAPI_EX_SUF)
		(
		  n0,
		  x0, incx0,
		  &bli_index,
		  NULL,
		  NULL
		);
	}

	/* Convert zero-based BLIS (C) index to one-based BLAS (Fortran)
	   index. Also, if the BLAS integer size differs from the BLIS
	   integer size, that typecast occurs here. */
	f77_index = bli_index + 1;

	/* Finalize BLIS. */
	// bli_finalize_auto();

	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
	return f77_index;
}
f77_int idamax_
     (
       const f77_int* n,
       const double* x, const f77_int* incx
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
    AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx);

    gint_t bli_index;
    f77_int f77_index;

    /* If the vector is empty, return an index of zero. This early check
       is needed to emulate netlib BLAS. Without it, bli_?amaxv() would
       return 0, which would get incremented to 1 (below) before being
       returned, which is not what we want. */
    if ( *n < 1 || *incx <= 0 ) {
        AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "idamax_: vector empty");
        return 0;
    }

    /* The early return above guarantees *n >= 1 and *incx >= 1, so the
       usual negative-n clamping and negative-stride pointer adjustment
       performed by other BLAS wrappers are unreachable here; just
       typecast the values directly. */
    dim_t   n0    = ( dim_t )(*n);
    double* x0    = ( double* )x;
    inc_t   incx0 = ( inc_t )(*incx);

    /* BLIS initialization/finalization is intentionally skipped on this
       fast path. */
    // bli_init_auto();

    // This function is invoked on all architectures including generic.
    // Non-AVX platforms will use the kernels derived from the context.
    if ( bli_cpuid_is_avx_supported() == TRUE )
    {
        /* Call the AVX-optimized BLIS kernel directly. */
        bli_damaxv_zen_int
        (
          n0,
          x0, incx0,
          &bli_index,
          NULL
        );
    }
    else
    {
        /* Fall back to the expert interface, which selects a kernel
           from the runtime context. */
        PASTEMAC2(d,amaxv,BLIS_TAPI_EX_SUF)
        (
          n0,
          x0, incx0,
          &bli_index,
          NULL,
          NULL
        );
    }

    /* Convert the zero-based BLIS (C) index to a one-based BLAS (Fortran)
       index. If the BLAS integer size differs from the BLIS integer
       size, that typecast also occurs here. */
    f77_index = bli_index + 1;

    // bli_finalize_auto();
    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
    return f77_index;
}
INSERT_GENTFUNC_BLAS_CZ( amax, amaxv )
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020 - 21, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -87,411 +87,6 @@ void PASTEF77(ch,blasname) \
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_EPYC
void saxpy_
(
const f77_int* n,
const float* alpha,
const float* x, const f77_int* incx,
float* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, (float*)alpha, *incx, *incy)
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((float*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((float*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((float*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((float*)y);
incy0 = ( inc_t )(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
bli_saxpyv_zen_int10
(
BLIS_NO_CONJUGATE,
n0,
(float*)alpha,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(s,axpyv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
(float*)alpha,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
void daxpy_
(
const f77_int* n,
const double* alpha,
const double* x, const f77_int* incx,
double* y, const f77_int* incy
)
{
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, (double*)alpha, *incx, *incy)
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((double*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((double*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((double*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((double*)y);
incy0 = ( inc_t )(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
bli_daxpyv_zen_int10
(
BLIS_NO_CONJUGATE,
n0,
(double*)alpha,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(d,axpyv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
(double*)alpha,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// bli_finalize_auto();
}
/* BLAS caxpy wrapper: y := alpha*x + y for single-precision complex vectors. */
void caxpy_
     (
       const f77_int* n,
       const scomplex* alpha,
       const scomplex* x, const f77_int* incx,
       scomplex* y, const f77_int* incy
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, (scomplex*)alpha, *incx, *incy)

    /* Initialization is intentionally skipped on this fast path. */
    // bli_init_auto();

    /* Per BLAS convention, a negative length is treated as zero. */
    const dim_t len = ( *n < 0 ) ? ( dim_t )0 : ( dim_t )(*n);

    /* BLAS callers always pass the address of the first (top-most)
       element, even for a negative stride; BLIS instead applies strides
       relative to the pointer it receives. So for a negative stride,
       advance the pointer to the last traversed element before handing
       it to BLIS. */
    const inc_t xinc = ( inc_t )(*incx);
    const inc_t yinc = ( inc_t )(*incy);
    scomplex* xp = ( scomplex* )x;
    scomplex* yp = ( scomplex* )y;
    if ( *incx < 0 ) xp += ( len - 1 ) * ( -*incx );
    if ( *incy < 0 ) yp += ( len - 1 ) * ( -*incy );

    /* With dynamic dispatch (the 'amdzen' configuration) this routine
       runs on every architecture, including generic. Use the
       hand-written Zen kernel only on a known AMD Zen core; otherwise
       go through the framework, which derives kernels from the
       context. */
    const arch_t id = bli_arch_query_id();
    const bool on_zen = ( id == BLIS_ARCH_ZEN4 ) || ( id == BLIS_ARCH_ZEN3 ) ||
                        ( id == BLIS_ARCH_ZEN2 ) || ( id == BLIS_ARCH_ZEN );

    if ( on_zen )
    {
        bli_caxpyv_zen_int5
        (
          BLIS_NO_CONJUGATE,
          len,
          ( scomplex* )alpha,
          xp, xinc,
          yp, yinc,
          NULL
        );
    }
    else
    {
        PASTEMAC2(c,axpyv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          len,
          ( scomplex* )alpha,
          xp, xinc,
          yp, yinc,
          NULL,
          NULL
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
    /* Finalization intentionally skipped, mirroring the skipped init. */
    // bli_finalize_auto();
}
/* BLAS zaxpy wrapper: y := alpha*x + y for double-precision complex vectors. */
void zaxpy_
     (
       const f77_int* n,
       const dcomplex* alpha,
       const dcomplex* x, const f77_int* incx,
       dcomplex* y, const f77_int* incy
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, (dcomplex*)alpha, *incx, *incy)

    /* Initialization is intentionally skipped on this fast path. */
    // bli_init_auto();

    /* Per BLAS convention, a negative length is treated as zero. */
    const dim_t len = ( *n < 0 ) ? ( dim_t )0 : ( dim_t )(*n);

    /* BLAS callers always pass the address of the first (top-most)
       element, even for a negative stride; BLIS instead applies strides
       relative to the pointer it receives. So for a negative stride,
       advance the pointer to the last traversed element before handing
       it to BLIS. */
    const inc_t xinc = ( inc_t )(*incx);
    const inc_t yinc = ( inc_t )(*incy);
    dcomplex* xp = ( dcomplex* )x;
    dcomplex* yp = ( dcomplex* )y;
    if ( *incx < 0 ) xp += ( len - 1 ) * ( -*incx );
    if ( *incy < 0 ) yp += ( len - 1 ) * ( -*incy );

    /* With dynamic dispatch (the 'amdzen' configuration) this routine
       runs on every architecture, including generic. Use the
       hand-written Zen kernel only on a known AMD Zen core; otherwise
       go through the framework, which derives kernels from the
       context. */
    const arch_t id = bli_arch_query_id();
    const bool on_zen = ( id == BLIS_ARCH_ZEN4 ) || ( id == BLIS_ARCH_ZEN3 ) ||
                        ( id == BLIS_ARCH_ZEN2 ) || ( id == BLIS_ARCH_ZEN );

    if ( on_zen )
    {
        bli_zaxpyv_zen_int5
        (
          BLIS_NO_CONJUGATE,
          len,
          ( dcomplex* )alpha,
          xp, xinc,
          yp, yinc,
          NULL
        );
    }
    else
    {
        PASTEMAC2(z,axpyv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          len,
          ( dcomplex* )alpha,
          xp, xinc,
          yp, yinc,
          NULL,
          NULL
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
    /* Finalization intentionally skipped, mirroring the skipped init. */
    // bli_finalize_auto();
}
#else
INSERT_GENTFUNC_BLAS( axpy, axpyv )
#endif
#endif

462
frame/compat/bla_axpy_amd.c Normal file
View File

@@ -0,0 +1,462 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNC: template for the generic BLAS-to-BLIS ?axpy wrapper,
// instantiated once per datatype (ftype = C type, ch = type character,
// blasname = BLAS routine stem, blisname = BLIS operation name).
// Each expansion: clamps negative n to zero, converts BLAS
// negative-stride conventions to BLIS pointer/stride form via
// bli_convert_blas_incv, and invokes the expert-interface axpyv
// (which derives kernels from the runtime context).
// NOTE(review): kept byte-identical — comment additions only; the
// continuation backslashes inside the macro are load-bearing.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_int* n, \
const ftype* alpha, \
const ftype* x, const f77_int* incx, \
ftype* y, const f77_int* incy \
) \
{ \
dim_t n0; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, (void*)alpha, *incx, *incy) \
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
n0, \
(ftype*)alpha, \
x0, incx0, \
y0, incy0, \
NULL, \
NULL \
); \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
void saxpy_
(
const f77_int* n,
const float* alpha,
const float* x, const f77_int* incx,
float* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, (float*)alpha, *incx, *incy)
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((float*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((float*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((float*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((float*)y);
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
bli_saxpyv_zen_int10
(
BLIS_NO_CONJUGATE,
n0,
(float*)alpha,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(s,axpyv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
(float*)alpha,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
void daxpy_
(
const f77_int* n,
const double* alpha,
const double* x, const f77_int* incx,
double* y, const f77_int* incy
)
{
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, (double*)alpha, *incx, *incy)
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((double*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((double*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((double*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((double*)y);
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
bli_daxpyv_zen_int10
(
BLIS_NO_CONJUGATE,
n0,
(double*)alpha,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(d,axpyv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
(double*)alpha,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// bli_finalize_auto();
}
/* BLAS caxpy wrapper (AMD build): y := alpha*x + y, single-precision complex. */
void caxpy_
     (
       const f77_int* n,
       const scomplex* alpha,
       const scomplex* x, const f77_int* incx,
       scomplex* y, const f77_int* incy
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, (scomplex*)alpha, *incx, *incy)

    /* Initialization is intentionally skipped on this fast path. */
    // bli_init_auto();

    /* Per BLAS convention, a negative length is treated as zero. */
    const dim_t len = ( *n < 0 ) ? ( dim_t )0 : ( dim_t )(*n);

    /* BLAS callers always pass the address of the first (top-most)
       element, even for a negative stride; BLIS instead applies strides
       relative to the pointer it receives. So for a negative stride,
       advance the pointer to the last traversed element before handing
       it to BLIS. */
    const inc_t xinc = ( inc_t )(*incx);
    const inc_t yinc = ( inc_t )(*incy);
    scomplex* xp = ( scomplex* )x;
    scomplex* yp = ( scomplex* )y;
    if ( *incx < 0 ) xp += ( len - 1 ) * ( -*incx );
    if ( *incy < 0 ) yp += ( len - 1 ) * ( -*incy );

    // This function is invoked on all architectures including generic.
    // Non-AVX platforms will use the kernels derived from the context.
    if ( bli_cpuid_is_avx_supported() == TRUE )
    {
        bli_caxpyv_zen_int5
        (
          BLIS_NO_CONJUGATE,
          len,
          ( scomplex* )alpha,
          xp, xinc,
          yp, yinc,
          NULL
        );
    }
    else
    {
        PASTEMAC2(c,axpyv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          len,
          ( scomplex* )alpha,
          xp, xinc,
          yp, yinc,
          NULL,
          NULL
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
    /* Finalization intentionally skipped, mirroring the skipped init. */
    // bli_finalize_auto();
}
/* BLAS zaxpy wrapper (AMD build): y := alpha*x + y, double-precision complex. */
void zaxpy_
     (
       const f77_int* n,
       const dcomplex* alpha,
       const dcomplex* x, const f77_int* incx,
       dcomplex* y, const f77_int* incy
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, (dcomplex*)alpha, *incx, *incy)

    /* Initialization is intentionally skipped on this fast path. */
    // bli_init_auto();

    /* Per BLAS convention, a negative length is treated as zero. */
    const dim_t len = ( *n < 0 ) ? ( dim_t )0 : ( dim_t )(*n);

    /* BLAS callers always pass the address of the first (top-most)
       element, even for a negative stride; BLIS instead applies strides
       relative to the pointer it receives. So for a negative stride,
       advance the pointer to the last traversed element before handing
       it to BLIS. */
    const inc_t xinc = ( inc_t )(*incx);
    const inc_t yinc = ( inc_t )(*incy);
    dcomplex* xp = ( dcomplex* )x;
    dcomplex* yp = ( dcomplex* )y;
    if ( *incx < 0 ) xp += ( len - 1 ) * ( -*incx );
    if ( *incy < 0 ) yp += ( len - 1 ) * ( -*incy );

    // This function is invoked on all architectures including generic.
    // Non-AVX platforms will use the kernels derived from the context.
    if ( bli_cpuid_is_avx_supported() == TRUE )
    {
        bli_zaxpyv_zen_int5
        (
          BLIS_NO_CONJUGATE,
          len,
          ( dcomplex* )alpha,
          xp, xinc,
          yp, yinc,
          NULL
        );
    }
    else
    {
        PASTEMAC2(z,axpyv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          len,
          ( dcomplex* )alpha,
          xp, xinc,
          yp, yinc,
          NULL,
          NULL
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
    /* Finalization intentionally skipped, mirroring the skipped init. */
    // bli_finalize_auto();
}
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -88,217 +88,5 @@ void PASTEF77(ch,blasname) \
}
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_EPYC
void scopy_
(
const f77_int* n,
const float* x, const f77_int* incx,
float* y, const f77_int* incy
)
{
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy)
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if (*n < 0)
n0 = (dim_t)0;
else
n0 = (dim_t)(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if (*incx < 0)
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (float*)((x)+(n0 - 1)*(-*incx));
incx0 = (inc_t)(*incx);
}
else
{
x0 = (float*)(x);
incx0 = (inc_t)(*incx);
}
if (*incy < 0)
{
y0 = (y)+(n0 - 1)*(-*incy);
incy0 = (inc_t)(*incy);
}
else
{
y0 = (y);
incy0 = (inc_t)(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel */
bli_scopyv_zen_int
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(s, copyv, BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
// bli_finalize_auto();
}
void dcopy_
(
const f77_int* n,
const double* x, const f77_int* incx,
double* y, const f77_int* incy
)
{
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy)
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if (*n < 0)
n0 = (dim_t)0;
else
n0 = (dim_t)(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if (*incx < 0)
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (double*)((x)+(n0 - 1)*(-*incx));
incx0 = (inc_t)(*incx);
}
else
{
x0 = (double*)(x);
incx0 = (inc_t)(*incx);
}
if (*incy < 0)
{
y0 = (y)+(n0 - 1)*(-*incy);
incy0 = (inc_t)(*incy);
}
else
{
y0 = (y);
incy0 = (inc_t)(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel */
bli_dcopyv_zen_int
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(d, copyv, BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
// bli_finalize_auto();
}
INSERT_GENTFUNC_BLAS_CZ(copy, copyv)
#else
INSERT_GENTFUNC_BLAS(copy, copyv)
#endif
#endif

285
frame/compat/bla_copy_amd.c Normal file
View File

@@ -0,0 +1,285 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNC: macro template that instantiates one BLAS-to-BLIS ?copy
// wrapper per datatype.  Each expansion defines PASTEF77(ch,blasname)
// (e.g. ccopy_ / zcopy_), which:
//   1. logs entry via the AOCL DTL trace macros,
//   2. clamps a negative BLAS length n to zero (bli_convert_blas_dim1),
//   3. rebases x/y for negative BLAS increments (bli_convert_blas_incv),
//   4. forwards to the BLIS copyv expert interface (PASTEMAC2).
// In this AMD-specific file the template is instantiated only for the
// complex types (see INSERT_GENTFUNC_BLAS_CZ below); scopy_/dcopy_ are
// written out by hand to add AVX kernel dispatch.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_int* n, \
const ftype* x, const f77_int* incx, \
ftype* y, const f77_int* incy \
) \
{ \
dim_t n0; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
\
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, *incx, *incy) \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv(n0, (ftype*)x, *incx, x0, incx0); \
bli_convert_blas_incv(n0, (ftype*)y, *incy, y0, incy0); \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch, blisname, BLIS_TAPI_EX_SUF) \
(\
BLIS_NO_CONJUGATE, \
n0, \
x0, incx0, \
y0, incy0, \
NULL, \
NULL \
); \
\
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
\
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
void scopy_
(
const f77_int* n,
const float* x, const f77_int* incx,
float* y, const f77_int* incy
)
{
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy)
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if (*n < 0)
n0 = (dim_t)0;
else
n0 = (dim_t)(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if (*incx < 0)
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (float*)((x)+(n0 - 1)*(-*incx));
incx0 = (inc_t)(*incx);
}
else
{
x0 = (float*)(x);
incx0 = (inc_t)(*incx);
}
if (*incy < 0)
{
y0 = (y)+(n0 - 1)*(-*incy);
incy0 = (inc_t)(*incy);
}
else
{
y0 = (y);
incy0 = (inc_t)(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
/* Call BLIS kernel */
bli_scopyv_zen_int
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(s, copyv, BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
// bli_finalize_auto();
}
void dcopy_
(
const f77_int* n,
const double* x, const f77_int* incx,
double* y, const f77_int* incy
)
{
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy)
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if (*n < 0)
n0 = (dim_t)0;
else
n0 = (dim_t)(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if (*incx < 0)
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (double*)((x)+(n0 - 1)*(-*incx));
incx0 = (inc_t)(*incx);
}
else
{
x0 = (double*)(x);
incx0 = (inc_t)(*incx);
}
if (*incy < 0)
{
y0 = (y)+(n0 - 1)*(-*incy);
incy0 = (inc_t)(*incy);
}
else
{
y0 = (y);
incy0 = (inc_t)(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
/* Call BLIS kernel */
bli_dcopyv_zen_int
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(d, copyv, BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
// bli_finalize_auto();
}
INSERT_GENTFUNC_BLAS_CZ(copy, copyv)
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018-2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -90,681 +90,11 @@ ftype PASTEF772(ch,blasname,chc) \
}
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_EPYC
float sdot_
(
const f77_int* n,
const float* x, const f77_int* incx,
const float* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy);
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
float rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((float*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((float*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((float*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((float*)y);
incy0 = ( inc_t )(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel. */
bli_sdotv_zen_int10
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(s,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
double ddot_
(
const f77_int* n,
const double* x, const f77_int* incx,
const double* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy);
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
double rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((double*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((double*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((double*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((double*)y);
incy0 = ( inc_t )(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel. */
bli_ddotv_zen_int10
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(d,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
#else
INSERT_GENTFUNCDOTR_BLAS( dot, dotv )
#endif
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL
#ifdef BLIS_CONFIG_EPYC
scomplex cdotu_
(
const f77_int* n,
const scomplex* x, const f77_int* incx,
const scomplex* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, *incx, *incy);
dim_t n0;
scomplex* x0;
scomplex* y0;
inc_t incx0;
inc_t incy0;
scomplex rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((scomplex*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((scomplex*)y);
incy0 = ( inc_t )(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel. */
bli_cdotv_zen_int5
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
dcomplex zdotu_
(
const f77_int* n,
const dcomplex* x, const f77_int* incx,
const dcomplex* y, const f77_int* incy
)
{
dim_t n0;
dcomplex* x0;
dcomplex* y0;
inc_t incx0;
inc_t incy0;
dcomplex rho;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, *incx, *incy);
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((dcomplex*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((dcomplex*)y);
incy0 = ( inc_t )(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel. */
bli_zdotv_zen_int5
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
scomplex cdotc_
(
const f77_int* n,
const scomplex* x, const f77_int* incx,
const scomplex* y, const f77_int* incy
)
{
dim_t n0;
scomplex* x0;
scomplex* y0;
inc_t incx0;
inc_t incy0;
scomplex rho;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, *incx, *incy);
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((scomplex*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((scomplex*)y);
incy0 = ( inc_t )(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel. */
bli_cdotv_zen_int5
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
dcomplex zdotc_
(
const f77_int* n,
const dcomplex* x, const f77_int* incx,
const dcomplex* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, *incx, *incy);
dim_t n0;
dcomplex* x0;
dcomplex* y0;
inc_t incx0;
inc_t incy0;
dcomplex rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((dcomplex*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((dcomplex*)y);
incy0 = ( inc_t )(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel. */
bli_zdotv_zen_int5
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
#else
INSERT_GENTFUNCDOTC_BLAS( dot, dotv )
#endif
#else
// For the "intel" complex return type, use a hidden parameter to return the result
#undef GENTFUNCDOT
@@ -819,8 +149,8 @@ void PASTEF772(ch,blasname,chc) \
}
INSERT_GENTFUNCDOTC_BLAS( dot, dotv )
#endif
#endif
#endif // BLIS_DISABLE_COMPLEX_RETURN_INTEL
#endif // BLIS_ENABLE_BLAS
// -- "Black sheep" dot product function definitions --
@@ -894,4 +224,4 @@ double PASTEF77(d,sdot)
return rho;
}
#endif
#endif // BLIS_ENABLE_BLAS

841
frame/compat/bla_dot_amd.c Normal file
View File

@@ -0,0 +1,841 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNCDOT: macro template that instantiates one BLAS-to-BLIS ?dot
// wrapper per (datatype, conjugation) pair.  Each expansion defines
// PASTEF772(ch,blasname,chc) (e.g. cdotu_/cdotc_), which:
//   1. logs entry via the AOCL DTL trace macros,
//   2. clamps a negative BLAS length n to zero (bli_convert_blas_dim1),
//   3. rebases x/y for negative BLAS increments (bli_convert_blas_incv),
//   4. forwards to the BLIS dotv expert interface with blis_conjx applied
//      to x, and returns the accumulated result rho by value.
// NOTE(review): the instantiation site is outside this view — presumably
// the INSERT_GENTFUNCDOT_* macros later in this file; verify there.
#undef GENTFUNCDOT
#define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
\
ftype PASTEF772(ch,blasname,chc) \
( \
const f77_int* n, \
const ftype* x, const f77_int* incx, \
const ftype* y, const f77_int* incy \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, *incx, *incy); \
dim_t n0; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
ftype rho; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
blis_conjx, \
BLIS_NO_CONJUGATE, \
n0, \
x0, incx0, \
y0, incy0, \
&rho, \
NULL, \
NULL \
); \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
\
return rho; \
}
#ifdef BLIS_ENABLE_BLAS
float sdot_
(
const f77_int* n,
const float* x, const f77_int* incx,
const float* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy);
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
float rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((float*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((float*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((float*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((float*)y);
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
/* Call BLIS kernel. */
bli_sdotv_zen_int10
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(s,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
double ddot_
(
const f77_int* n,
const double* x, const f77_int* incx,
const double* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy);
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
double rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((double*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((double*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((double*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((double*)y);
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
/* Call BLIS kernel. */
bli_ddotv_zen_int10
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(d,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
#ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL
scomplex cdotu_
(
const f77_int* n,
const scomplex* x, const f77_int* incx,
const scomplex* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, *incx, *incy);
dim_t n0;
scomplex* x0;
scomplex* y0;
inc_t incx0;
inc_t incy0;
scomplex rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((scomplex*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((scomplex*)y);
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
/* Call BLIS kernel. */
bli_cdotv_zen_int5
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
/* BLAS-to-BLIS bridge for ZDOTU: rho = x^T * y (no conjugation) for
   double-precision complex vectors.  Returns the result by value, i.e.
   the "gnu" complex-return convention (contrast with the hidden-parameter
   variant under BLIS_DISABLE_COMPLEX_RETURN_INTEL).  Dispatches to an
   AVX-optimized Zen kernel when the CPU supports AVX, otherwise to the
   generic BLIS typed API. */
dcomplex zdotu_
(
const f77_int* n,
const dcomplex* x, const f77_int* incx,
const dcomplex* y, const f77_int* incy
)
{
dim_t n0;
dcomplex* x0;
dcomplex* y0;
inc_t incx0;
inc_t incy0;
dcomplex rho;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, *incx, *incy);
/* Initialize BLIS.  (Deliberately skipped here; initialization is
   handled elsewhere, so the auto-init call is left commented out.) */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The difference
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((dcomplex*)x);
incx0 = ( inc_t )(*incx);
}
/* Same base-pointer adjustment for y. */
if ( *incy < 0 )
{
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((dcomplex*)y);
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
/* Call BLIS kernel (AVX-optimized Zen dotv). */
bli_zdotv_zen_int5
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface (typed expert API; context resolved internally). */
PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS.  (Skipped to mirror the skipped initialization above.) */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
scomplex cdotc_
(
const f77_int* n,
const scomplex* x, const f77_int* incx,
const scomplex* y, const f77_int* incy
)
{
dim_t n0;
scomplex* x0;
scomplex* y0;
inc_t incx0;
inc_t incy0;
scomplex rho;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, *incx, *incy);
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((scomplex*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((scomplex*)y);
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
/* Call BLIS kernel. */
bli_cdotv_zen_int5
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
/* BLAS-to-BLIS bridge for ZDOTC: rho = conj(x)^T * y for double-precision
   complex vectors, returned by value ("gnu" complex-return convention).
   Only difference from zdotu_ is that x is conjugated (BLIS_CONJUGATE). */
dcomplex zdotc_
(
const f77_int* n,
const dcomplex* x, const f77_int* incx,
const dcomplex* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, *incx, *incy);
dim_t n0;
dcomplex* x0;
dcomplex* y0;
inc_t incx0;
inc_t incy0;
dcomplex rho;
/* Initialize BLIS.  (Deliberately skipped; see commented call.) */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The difference
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((dcomplex*)x);
incx0 = ( inc_t )(*incx);
}
/* Same base-pointer adjustment for y. */
if ( *incy < 0 )
{
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((dcomplex*)y);
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
/* Call BLIS kernel (AVX-optimized Zen dotv; x is conjugated). */
bli_zdotv_zen_int5
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface (typed expert API). */
PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS.  (Skipped to mirror the skipped initialization.) */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
#else // BLIS_DISABLE_COMPLEX_RETURN_INTEL
// For the "intel" complex return type, use a hidden parameter to return the result
/* Generator for the "intel" complex-return flavor of the ?dot functions:
   instead of returning the complex result by value, the result is written
   through a hidden leading pointer parameter (rhop).  Instantiated below
   for cdotc/cdotu/zdotc/zdotu via INSERT_GENTFUNCDOT_BLAS. */
#undef GENTFUNCDOT
#define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
\
void PASTEF772(ch,blasname,chc) \
( \
ftype* rhop, \
const f77_int* n, \
const ftype* x, const f77_int* incx, \
const ftype* y, const f77_int* incy \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, *incx, *incy); \
dim_t n0; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
ftype rho; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
blis_conjx, \
BLIS_NO_CONJUGATE, \
n0, \
x0, incx0, \
y0, incy0, \
&rho, \
NULL, \
NULL \
); \
\
/* Finalize BLIS. */ \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
bli_finalize_auto(); \
\
*rhop = rho; \
}
/* Expand the template for all four complex dot variants. */
INSERT_GENTFUNCDOT_BLAS( dot, dotv )
#endif // BLIS_DISABLE_COMPLEX_RETURN_INTEL
// -- "Black sheep" dot product function definitions --
// Input vectors stored in single precision, computed in double precision,
// with result returned in single precision.
/* SDSDOT: dot product of two single-precision vectors, accumulated in
   double precision with scalar *sb added, result truncated back to
   single precision.  Delegates the accumulation to dsdot. */
float PASTEF77(sd,sdot)
(
const f77_int* n,
const float* sb,
const float* x, const f77_int* incx,
const float* y, const f77_int* incy
)
{
/* Seed the double-precision accumulator with *sb, add the
   double-precision dot product, then narrow to float on return. */
double acc = ( double )(*sb);
acc += PASTEF77(d,sdot)
(
n,
x, incx,
y, incy
);
return ( float )acc;
}
// Input vectors stored in single precision, computed in double precision,
// with result returned in double precision.
double PASTEF77(d,sdot)
(
const f77_int* n,
const float* x, const f77_int* incx,
const float* y, const f77_int* incy
)
{
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
double rho;
dim_t i;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy);
/* Initialization of BLIS is not required. */
/* Convert/typecast negative values of n to zero. */
bli_convert_blas_dim1( *n, n0 );
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
bli_convert_blas_incv( n0, (float*)x, *incx, x0, incx0 );
bli_convert_blas_incv( n0, (float*)y, *incy, y0, incy0 );
rho = 0.0;
for ( i = 0; i < n0; i++ )
{
float* chi1 = x0 + (i )*incx0;
float* psi1 = y0 + (i )*incy0;
bli_ddots( (( double )(*chi1)),
(( double )(*psi1)), rho );
}
/* Finalization of BLIS is not required, because initialization was
not required. */
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 21, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2019 - 22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -300,512 +300,7 @@ void PASTEF77(ch,blasname) \
#endif
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_EPYC
/* BLAS dgemm entry point (AMD/EPYC build).  Dispatch order:
   1. non-Zen CPU          -> generic object API (reference path)
   2. k == 1, no transposes -> specialized rank-1 update kernel
   3. n == 1 or m == 1      -> gemv fast paths
   4. multithreaded         -> parallel sup/native object API
   5. small matrices (ST)   -> bli_dgemm_small / _small_At
   6. sup path, else native gemm as final fallback. */
void dgemm_
(
const f77_char* transa,
const f77_char* transb,
const f77_int* m,
const f77_int* n,
const f77_int* k,
const double* alpha,
const double* a, const f77_int* lda,
const double* b, const f77_int* ldb,
const double* beta,
double* c, const f77_int* ldc
)
{
trans_t blis_transa;
trans_t blis_transb;
dim_t m0, n0, k0;
/* Initialize BLIS. */
bli_init_auto();
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k, \
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemm)
(
MKSTR(d),
MKSTR(gemm),
transa,
transb,
m,
n,
k,
lda,
ldb,
ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans(*transa, &blis_transa);
bli_param_map_netlib_to_blis_trans(*transb, &blis_transb);
/* Typecast BLAS integers to BLIS integers. */
bli_convert_blas_dim1(*m, m0);
bli_convert_blas_dim1(*n, n0);
bli_convert_blas_dim1(*k, k0);
/* Set the row and column strides of the matrix operands
   (column-major storage, per BLAS). */
const inc_t rs_a = 1;
const inc_t cs_a = *lda;
const inc_t rs_b = 1;
const inc_t cs_b = *ldb;
const inc_t rs_c = 1;
const inc_t cs_c = *ldc;
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (!bamdzen)
{
// This code is duplicated below, however we don't want to move it out of
// this IF block as it will affect the performance on Zen architectures.
// Also this is a temporary fix which will be replaced later.
const num_t dt = BLIS_DOUBLE;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t ao = BLIS_OBJECT_INITIALIZER;
obj_t bo = BLIS_OBJECT_INITIALIZER;
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t co = BLIS_OBJECT_INITIALIZER;
dim_t m0_a, n0_a;
dim_t m0_b, n0_b;
bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a);
bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b);
bli_obj_init_finish_1x1(dt, (double *)alpha, &alphao);
bli_obj_init_finish_1x1(dt, (double *)beta, &betao);
bli_obj_init_finish(dt, m0_a, n0_a, (double *)a, rs_a, cs_a, &ao);
bli_obj_init_finish(dt, m0_b, n0_b, (double *)b, rs_b, cs_b, &bo);
bli_obj_init_finish(dt, m0, n0, (double *)c, rs_c, cs_c, &co);
bli_obj_set_conjtrans(blis_transa, &ao);
bli_obj_set_conjtrans(blis_transb, &bo);
// Will call parallelized dgemm code - sup & native
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
(
&alphao,
&ao,
&bo,
&betao,
&co,
NULL,
NULL
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
/* Fast path: k == 1 with both operands untransposed is a rank-1 update. */
if((k0 == 1) && bli_is_notrans(blis_transa) && bli_is_notrans(blis_transb))
{
bli_dgemm_ref_k1_nn( m0, n0, k0,
(double*)alpha,
(double*)a, *lda,
(double*)b, *ldb,
(double*)beta,
c, *ldc
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS */
bli_finalize_auto();
return;
}
/* Fast path: a single column (n == 1) degenerates gemm to gemv. */
if (n0 == 1)
{
if (bli_is_notrans(blis_transa))
{
bli_dgemv_unf_var2(
BLIS_NO_TRANSPOSE,
bli_extract_conj(blis_transb),
m0, k0,
(double*)alpha,
(double*)a, rs_a, cs_a,
(double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b,
(double*)beta,
c, rs_c,
((void*)0)
);
}
else
{
bli_dgemv_unf_var1(
blis_transa,
bli_extract_conj(blis_transb),
k0, m0,
(double*)alpha,
(double*)a, rs_a, cs_a,
(double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b,
(double*)beta,
c, rs_c,
((void*)0)
);
}
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
/* NOTE(review): this early return skips AOCL_DTL_TRACE_EXIT and
   bli_finalize_auto(), unlike the other exit paths — confirm intent. */
return;
}
/* Fast path: a single row (m == 1) degenerates gemm to gemv on b. */
else if (m0 == 1)
{
if (bli_is_notrans(blis_transb))
{
bli_dgemv_unf_var1(
blis_transb,
bli_extract_conj(blis_transa),
n0, k0,
(double*)alpha,
(double*)b, cs_b, rs_b,
(double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a,
(double*)beta,
c, cs_c,
((void*)0)
);
}
else
{
bli_dgemv_unf_var2(
blis_transb,
bli_extract_conj(blis_transa),
k0, n0,
(double*)alpha,
(double*)b, cs_b, rs_b,
(double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a,
(double*)beta,
c, cs_c,
((void*)0)
);
}
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
/* NOTE(review): same skipped trace-exit/finalize as the n0 == 1 path. */
return;
}
/* General path: wrap operands in BLIS objects for the object API. */
const num_t dt = BLIS_DOUBLE;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t ao = BLIS_OBJECT_INITIALIZER;
obj_t bo = BLIS_OBJECT_INITIALIZER;
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t co = BLIS_OBJECT_INITIALIZER;
dim_t m0_a, n0_a;
dim_t m0_b, n0_b;
bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a);
bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b);
bli_obj_init_finish_1x1(dt, (double*)alpha, &alphao);
bli_obj_init_finish_1x1(dt, (double*)beta, &betao);
bli_obj_init_finish(dt, m0_a, n0_a, (double*)a, rs_a, cs_a, &ao);
bli_obj_init_finish(dt, m0_b, n0_b, (double*)b, rs_b, cs_b, &bo);
bli_obj_init_finish(dt, m0, n0, (double*)c, rs_c, cs_c, &co);
bli_obj_set_conjtrans(blis_transa, &ao);
bli_obj_set_conjtrans(blis_transb, &bo);
//cntx_t* cntx = bli_gks_query_cntx();
//dim_t nt = bli_thread_get_num_threads(); // get number of threads
bool nt = bli_thread_get_is_parallel(); // Check if parallel dgemm is invoked.
// if m0 is large and (n0 & k0) < 10 - SMALL GEMM - ST is better
//
#ifdef AOCL_DYNAMIC
if (nt && ((n0 > 10 ) || (k0 > 10)) )
#else
if (nt)
#endif
{
// Will call parallelized dgemm code - sup & native
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
(
&alphao,
&ao,
&bo,
&betao,
&co,
NULL,
NULL
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
// The code below will be called when number of threads = 1.
#ifdef BLIS_ENABLE_SMALL_MATRIX
//if( ((m0 + n0 -k0) < 2000) && ((m0 + k0-n0) < 2000) && ((n0 + k0-m0) < 2000) && (n0 > 2))
if( ( ( (m0 + n0 -k0) < 2000) && ((m0 + k0-n0) < 2000) && ((n0 + k0-m0) < 2000) ) ||
((n0 <= 10) && (k0 <=10)) )
{
err_t status;
if (bli_is_notrans(blis_transa))
{
status = bli_dgemm_small( &alphao,
&ao,
&bo,
&betao,
&co,
NULL, //cntx,
NULL
);
}
else
{
/* Transposed-A variant of the small-matrix kernel. */
status = bli_dgemm_small_At ( &alphao,
&ao,
&bo,
&betao,
&co,
NULL, //cntx,
NULL
);
}
if (status == BLIS_SUCCESS)
{
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
}
#endif //#ifdef BLIS_ENABLE_SMALL_MATRIX
/* Try the sup (skinny/unpacked) path; fall through on failure. */
err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
if (status == BLIS_SUCCESS)
{
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
/* NOTE(review): early return without trace-exit/finalize here too. */
return;
}
// fall back on native path when dgemm is not handled in sup path.
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
/* PASTEMAC(gemm, BLIS_OAPI_EX_SUF) */
/* ( */
/* &alphao, */
/* &ao, */
/* &bo, */
/* &betao, */
/* &co, */
/* NULL, */
/* NULL */
/* ); */
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
} // end of dgemm_
/* BLAS zgemm entry point.  Dispatch order:
   1. multithreaded        -> parallel sup/native object API
   2. (optional) 3m_sqp induced method for selected shapes
   3. single-instance mode -> sup path
   4. native gemm as the final fallback.
   Fixes relative to previous revision: bli_finalize_auto() and the DTL
   trace-exit were unreachable after the unconditional `return;` on the
   native fallback path, and the sup/induced early returns skipped
   finalization entirely — every exit now balances bli_init_auto(). */
void zgemm_
(
const f77_char* transa,
const f77_char* transb,
const f77_int* m,
const f77_int* n,
const f77_int* k,
const dcomplex* alpha,
const dcomplex* a, const f77_int* lda,
const dcomplex* b, const f77_int* ldb,
const dcomplex* beta,
dcomplex* c, const f77_int* ldc
)
{
trans_t blis_transa;
trans_t blis_transb;
dim_t m0, n0, k0;
/* Initialize BLIS. */
bli_init_auto();
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k,
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemm)
(
MKSTR(z),
MKSTR(gemm),
transa,
transb,
m,
n,
k,
lda,
ldb,
ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa );
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb );
/* Typecast BLAS integers to BLIS integers. */
bli_convert_blas_dim1( *m, m0 );
bli_convert_blas_dim1( *n, n0 );
bli_convert_blas_dim1( *k, k0 );
/* Set the row and column strides of the matrix operands
   (column-major storage, per BLAS). */
const inc_t rs_a = 1;
const inc_t cs_a = *lda;
const inc_t rs_b = 1;
const inc_t cs_b = *ldb;
const inc_t rs_c = 1;
const inc_t cs_c = *ldc;
/* Wrap operands in BLIS objects for the object API. */
const num_t dt = BLIS_DCOMPLEX;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t ao = BLIS_OBJECT_INITIALIZER;
obj_t bo = BLIS_OBJECT_INITIALIZER;
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t co = BLIS_OBJECT_INITIALIZER;
dim_t m0_a, n0_a;
dim_t m0_b, n0_b;
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a );
bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b );
bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao );
bli_obj_init_finish_1x1( dt, (dcomplex*)beta, &betao );
bli_obj_init_finish( dt, m0_a, n0_a, (dcomplex*)a, rs_a, cs_a, &ao );
bli_obj_init_finish( dt, m0_b, n0_b, (dcomplex*)b, rs_b, cs_b, &bo );
bli_obj_init_finish( dt, m0, n0, (dcomplex*)c, rs_c, cs_c, &co );
bli_obj_set_conjtrans( blis_transa, &ao );
bli_obj_set_conjtrans( blis_transb, &bo );
// Default-instance performance tuning is done in zgemm.
// Single-instance tuning is selected via an environment variable.
dim_t single_instance = bli_env_get_var( "BLIS_SINGLE_INSTANCE", -1 );
//dim_t nt = bli_thread_get_num_threads(); // get number of threads
bool nt = bli_thread_get_is_parallel(); // Check if parallel zgemm is invoked.
if ( nt )
{
// Will call parallelized zgemm code - sup & native
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
(
&alphao,
&ao,
&bo,
&betao,
&co,
NULL,
NULL
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
// The code below will be called when number of threads = 1.
#if ENABLE_INDUCED_METHOD
/* 3m_sqp is optimal for certain matrix shapes.
Initial study shows that it works well for square sizes and sizes close to square shape.
* Usage of 3m_sqp is restricted to sizes where it is found efficient compared to native, sup and other induced methods.
* Further investigation is necessary to make the usage choices more generic. */
bool sqp_on = false;
if( (m0 == n0 ) && ( n0 == k0 ) && ( m0 == 128 ) )
{
sqp_on = true;
}
// current range of sizes used for 3m_sqp to be expanded after evaluation.
if( ( m0 >= 4200) && ( m0 <= 4600 ) && ( ( n0 >= 326 ) || (n0 <= 1600 ) )
&& ( k0 == 1120 ) ) //to be tuned further.
{
sqp_on = true;
}
if( ( blis_transb == BLIS_NO_TRANSPOSE) && ( sqp_on == true ) )
{
//sqp algo is found better for n > 40
if(bli_gemm_sqp(&alphao, &ao, &bo, &betao, &co, NULL, NULL)==BLIS_SUCCESS)
{
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
}
#endif//ENABLE_INDUCED_METHOD
// native tuning resulted in better numbers compared to sup in constrained multi-instance
// sup has been enabled for single instance cases.
if(single_instance==1)
{
err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
if(status==BLIS_SUCCESS)
{
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
}
// fall back on native path when zgemm is not handled in sup path.
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
bli_finalize_auto();
}// end of zgemm_
INSERT_GENTFUNC_BLAS_SC( gemm, gemm )
#else
INSERT_GENTFUNC_BLAS( gemm,gemm )
#endif
// Observed a regression in dgemm with this function addition.
// Disabling temporarily.

894
frame/compat/bla_gemm_amd.c Normal file
View File

@@ -0,0 +1,894 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
#define ENABLE_INDUCED_METHOD 0
#ifdef BLIS_BLAS3_CALLS_TAPI
/* Generator for simple gemm wrappers that call straight into the BLIS
   typed API (compiled when BLIS_BLAS3_CALLS_TAPI is defined).
   Fix: the AOCL_DTL_LOG_GEMM_STATS line was missing its trailing
   backslash, which terminated the #define early and left the remaining
   lines (trace-exit, finalize, closing brace) as stray top-level tokens
   — a compile error. */
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_char* transa, \
const f77_char* transb, \
const f77_int* m, \
const f77_int* n, \
const f77_int* k, \
const ftype* alpha, \
const ftype* a, const f77_int* lda, \
const ftype* b, const f77_int* ldb, \
const ftype* beta, \
ftype* c, const f77_int* ldc \
) \
{ \
trans_t blis_transa; \
trans_t blis_transb; \
dim_t m0, n0, k0; \
inc_t rs_a, cs_a; \
inc_t rs_b, cs_b; \
inc_t rs_c, cs_c; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *transb, *m, *n, *k, \
(void*)alpha, *lda, *ldb, (void*)beta, *ldc); \
\
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
transa, \
transb, \
m, \
n, \
k, \
lda, \
ldb, \
ldc \
); \
\
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
\
/* Typecast BLAS integers to BLIS integers. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *n, n0 ); \
bli_convert_blas_dim1( *k, k0 ); \
\
/* Set the row and column strides of the matrix operands. */ \
rs_a = 1; \
cs_a = *lda; \
rs_b = 1; \
cs_b = *ldb; \
rs_c = 1; \
cs_c = *ldc; \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
blis_transa, \
blis_transb, \
m0, \
n0, \
k0, \
(ftype*)alpha, \
(ftype*)a, rs_a, cs_a, \
(ftype*)b, rs_b, cs_b, \
(ftype*)beta, \
(ftype*)c, rs_c, cs_c, \
NULL, \
NULL \
); \
\
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#else
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_char* transa, \
const f77_char* transb, \
const f77_int* m, \
const f77_int* n, \
const f77_int* k, \
const ftype* alpha, \
const ftype* a, const f77_int* lda, \
const ftype* b, const f77_int* ldb, \
const ftype* beta, \
ftype* c, const f77_int* ldc \
) \
{ \
\
trans_t blis_transa; \
trans_t blis_transb; \
dim_t m0, n0, k0; \
\
dim_t m0_a, n0_a; \
dim_t m0_b, n0_b; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *transb, *m, *n, *k, \
(void*)alpha, *lda, *ldb, (void*)beta, *ldc); \
\
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
transa, \
transb, \
m, \
n, \
k, \
lda, \
ldb, \
ldc \
); \
\
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
\
/* Typecast BLAS integers to BLIS integers. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *n, n0 ); \
bli_convert_blas_dim1( *k, k0 ); \
\
/* Set the row and column strides of the matrix operands. */ \
const inc_t rs_a = 1; \
const inc_t cs_a = *lda; \
const inc_t rs_b = 1; \
const inc_t cs_b = *ldb; \
const inc_t rs_c = 1; \
const inc_t cs_c = *ldc; \
\
if( n0 == 1 ) \
{ \
if(bli_is_notrans(blis_transa)) \
{ \
PASTEMAC(ch,gemv_unf_var2)( \
BLIS_NO_TRANSPOSE, \
bli_extract_conj(blis_transb), \
m0, k0, \
(ftype*)alpha, \
(ftype*)a, rs_a, cs_a,\
(ftype*)b, bli_is_notrans(blis_transb)?rs_b:cs_b, \
(ftype*) beta, \
c, rs_c, \
NULL \
); \
} \
else \
{ \
PASTEMAC(ch,gemv_unf_var1)( \
blis_transa, \
bli_extract_conj(blis_transb), \
k0, m0, \
(ftype*)alpha, \
(ftype*)a, rs_a, cs_a, \
(ftype*)b, bli_is_notrans(blis_transb)?rs_b:cs_b, \
(ftype*)beta, \
c, rs_c, \
NULL \
); \
} \
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
return; \
} \
else if( m0 == 1 ) \
{ \
if(bli_is_notrans(blis_transb)) \
{ \
PASTEMAC(ch,gemv_unf_var1)( \
blis_transb, \
bli_extract_conj(blis_transa), \
n0, k0, \
(ftype*)alpha, \
(ftype*)b, cs_b, rs_b, \
(ftype*)a, bli_is_notrans(blis_transa)?cs_a:rs_a, \
(ftype*)beta, \
c, cs_c, \
NULL \
); \
} \
else \
{ \
PASTEMAC(ch,gemv_unf_var2)( \
blis_transb, \
bli_extract_conj(blis_transa), \
k0, n0, \
(ftype*)alpha, \
(ftype*)b, cs_b, rs_b, \
(ftype*)a, bli_is_notrans(blis_transa)?cs_a:rs_a, \
(ftype*)beta, \
c, cs_c, \
NULL \
); \
} \
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
return; \
} \
\
const num_t dt = PASTEMAC(ch,type); \
\
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t ao = BLIS_OBJECT_INITIALIZER; \
obj_t bo = BLIS_OBJECT_INITIALIZER; \
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t co = BLIS_OBJECT_INITIALIZER; \
\
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \
bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); \
\
bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \
bli_obj_init_finish_1x1( dt, (ftype*)beta, &betao ); \
\
bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \
bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \
bli_obj_init_finish( dt, m0, n0, (ftype*)c, rs_c, cs_c, &co ); \
\
bli_obj_set_conjtrans( blis_transa, &ao ); \
bli_obj_set_conjtrans( blis_transb, &bo ); \
\
PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
( \
&alphao, \
&ao, \
&bo, \
&betao, \
&co, \
NULL, \
NULL \
); \
\
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#endif
#ifdef BLIS_ENABLE_BLAS
/* BLAS dgemm entry point (AMD-optimized copy in bla_gemm_amd.c).
   Dispatch order:
   1. no AVX support        -> generic object API (reference path)
   2. k == 1, no transposes -> specialized rank-1 update kernel
   3. n == 1 or m == 1      -> gemv fast paths
   4. multithreaded         -> parallel sup/native object API
   5. small matrices (ST)   -> bli_dgemm_small / _small_At
   6. sup path, else native gemm as final fallback.
   Fix relative to previous revision: the gemv fast paths and the
   sup-success path returned without AOCL_DTL_TRACE_EXIT and without
   bli_finalize_auto(), leaving the bli_init_auto() call unbalanced;
   every exit path now traces and finalizes. */
void dgemm_
(
const f77_char* transa,
const f77_char* transb,
const f77_int* m,
const f77_int* n,
const f77_int* k,
const double* alpha,
const double* a, const f77_int* lda,
const double* b, const f77_int* ldb,
const double* beta,
double* c, const f77_int* ldc
)
{
trans_t blis_transa;
trans_t blis_transb;
dim_t m0, n0, k0;
/* Initialize BLIS. */
bli_init_auto();
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k, \
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemm)
(
MKSTR(d),
MKSTR(gemm),
transa,
transb,
m,
n,
k,
lda,
ldb,
ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans(*transa, &blis_transa);
bli_param_map_netlib_to_blis_trans(*transb, &blis_transb);
/* Typecast BLAS integers to BLIS integers. */
bli_convert_blas_dim1(*m, m0);
bli_convert_blas_dim1(*n, n0);
bli_convert_blas_dim1(*k, k0);
/* Set the row and column strides of the matrix operands
   (column-major storage, per BLAS). */
const inc_t rs_a = 1;
const inc_t cs_a = *lda;
const inc_t rs_b = 1;
const inc_t cs_b = *ldb;
const inc_t rs_c = 1;
const inc_t cs_c = *ldc;
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
// This code is duplicated below, however we don't want to move it out of
// this IF block as it will affect the performance on Zen architectures.
// Also this is a temporary fix which will be replaced later.
const num_t dt = BLIS_DOUBLE;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t ao = BLIS_OBJECT_INITIALIZER;
obj_t bo = BLIS_OBJECT_INITIALIZER;
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t co = BLIS_OBJECT_INITIALIZER;
dim_t m0_a, n0_a;
dim_t m0_b, n0_b;
bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a);
bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b);
bli_obj_init_finish_1x1(dt, (double *)alpha, &alphao);
bli_obj_init_finish_1x1(dt, (double *)beta, &betao);
bli_obj_init_finish(dt, m0_a, n0_a, (double *)a, rs_a, cs_a, &ao);
bli_obj_init_finish(dt, m0_b, n0_b, (double *)b, rs_b, cs_b, &bo);
bli_obj_init_finish(dt, m0, n0, (double *)c, rs_c, cs_c, &co);
bli_obj_set_conjtrans(blis_transa, &ao);
bli_obj_set_conjtrans(blis_transb, &bo);
// Will call parallelized dgemm code - sup & native
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
(
&alphao,
&ao,
&bo,
&betao,
&co,
NULL,
NULL
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
/* Fast path: k == 1 with both operands untransposed is a rank-1 update. */
if((k0 == 1) && bli_is_notrans(blis_transa) && bli_is_notrans(blis_transb))
{
bli_dgemm_ref_k1_nn( m0, n0, k0,
(double*)alpha,
(double*)a, *lda,
(double*)b, *ldb,
(double*)beta,
c, *ldc
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS */
bli_finalize_auto();
return;
}
/* Fast path: a single column of C degenerates gemm to gemv. */
if (n0 == 1)
{
if (bli_is_notrans(blis_transa))
{
bli_dgemv_unf_var2(
BLIS_NO_TRANSPOSE,
bli_extract_conj(blis_transb),
m0, k0,
(double*)alpha,
(double*)a, rs_a, cs_a,
(double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b,
(double*)beta,
c, rs_c,
((void*)0)
);
}
else
{
bli_dgemv_unf_var1(
blis_transa,
bli_extract_conj(blis_transb),
k0, m0,
(double*)alpha,
(double*)a, rs_a, cs_a,
(double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b,
(double*)beta,
c, rs_c,
((void*)0)
);
}
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS (was previously skipped on this early return). */
bli_finalize_auto();
return;
}
/* Fast path: a single row of C degenerates gemm to gemv on b. */
else if (m0 == 1)
{
if (bli_is_notrans(blis_transb))
{
bli_dgemv_unf_var1(
blis_transb,
bli_extract_conj(blis_transa),
n0, k0,
(double*)alpha,
(double*)b, cs_b, rs_b,
(double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a,
(double*)beta,
c, cs_c,
((void*)0)
);
}
else
{
bli_dgemv_unf_var2(
blis_transb,
bli_extract_conj(blis_transa),
k0, n0,
(double*)alpha,
(double*)b, cs_b, rs_b,
(double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a,
(double*)beta,
c, cs_c,
((void*)0)
);
}
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS (was previously skipped on this early return). */
bli_finalize_auto();
return;
}
/* General path: wrap operands in BLIS objects for the object API. */
const num_t dt = BLIS_DOUBLE;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t ao = BLIS_OBJECT_INITIALIZER;
obj_t bo = BLIS_OBJECT_INITIALIZER;
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t co = BLIS_OBJECT_INITIALIZER;
dim_t m0_a, n0_a;
dim_t m0_b, n0_b;
bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a);
bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b);
bli_obj_init_finish_1x1(dt, (double*)alpha, &alphao);
bli_obj_init_finish_1x1(dt, (double*)beta, &betao);
bli_obj_init_finish(dt, m0_a, n0_a, (double*)a, rs_a, cs_a, &ao);
bli_obj_init_finish(dt, m0_b, n0_b, (double*)b, rs_b, cs_b, &bo);
bli_obj_init_finish(dt, m0, n0, (double*)c, rs_c, cs_c, &co);
bli_obj_set_conjtrans(blis_transa, &ao);
bli_obj_set_conjtrans(blis_transb, &bo);
//cntx_t* cntx = bli_gks_query_cntx();
//dim_t nt = bli_thread_get_num_threads(); // get number of threads
bool nt = bli_thread_get_is_parallel(); // Check if parallel dgemm is invoked.
// if m0 is large and (n0 & k0) < 10 - SMALL GEMM - ST is better
//
#ifdef AOCL_DYNAMIC
if (nt && ((n0 > 10 ) || (k0 > 10)) )
#else
if (nt)
#endif
{
// Will call parallelized dgemm code - sup & native
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
(
&alphao,
&ao,
&bo,
&betao,
&co,
NULL,
NULL
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
// The code below will be called when number of threads = 1.
#ifdef BLIS_ENABLE_SMALL_MATRIX
//if( ((m0 + n0 -k0) < 2000) && ((m0 + k0-n0) < 2000) && ((n0 + k0-m0) < 2000) && (n0 > 2))
if( ( ( (m0 + n0 -k0) < 2000) && ((m0 + k0-n0) < 2000) && ((n0 + k0-m0) < 2000) ) ||
((n0 <= 10) && (k0 <=10)) )
{
err_t status;
if (bli_is_notrans(blis_transa))
{
status = bli_dgemm_small( &alphao,
&ao,
&bo,
&betao,
&co,
NULL, //cntx,
NULL
);
}
else
{
/* Transposed-A variant of the small-matrix kernel. */
status = bli_dgemm_small_At ( &alphao,
&ao,
&bo,
&betao,
&co,
NULL, //cntx,
NULL
);
}
if (status == BLIS_SUCCESS)
{
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
}
#endif //#ifdef BLIS_ENABLE_SMALL_MATRIX
/* Try the sup (skinny/unpacked) path; fall through on failure. */
err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
if (status == BLIS_SUCCESS)
{
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS (was previously skipped on this early return). */
bli_finalize_auto();
return;
}
// fall back on native path when dgemm is not handled in sup path.
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
} // end of dgemm_
// BLAS-compatible zgemm_: C := beta*C + alpha*op(A)*op(B) for double-complex
// operands in column-major storage.
//
// Dispatch order:
//   1. multi-threaded invocation -> expert object API (sup + native inside),
//   2. (optional) 3m_sqp induced method for tuned shapes,
//   3. sup path, but only when BLIS_SINGLE_INSTANCE=1,
//   4. native path as the final fallback.
//
// Fix vs. previous revision: the native-fallback tail contained an
// unconditional `return;` that made the trailing AOCL_DTL_TRACE_EXIT and
// bli_finalize_auto() unreachable, leaving the bli_init_auto() above
// unbalanced on that path. The dead return (and duplicated trace-exit)
// has been removed so finalization actually runs.
void zgemm_
(
const f77_char* transa,
const f77_char* transb,
const f77_int* m,
const f77_int* n,
const f77_int* k,
const dcomplex* alpha,
const dcomplex* a, const f77_int* lda,
const dcomplex* b, const f77_int* ldb,
const dcomplex* beta,
dcomplex* c, const f77_int* ldc
)
{
trans_t blis_transa;
trans_t blis_transb;
dim_t m0, n0, k0;
/* Initialize BLIS. */
bli_init_auto();
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k,
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemm)
(
MKSTR(z),
MKSTR(gemm),
transa,
transb,
m,
n,
k,
lda,
ldb,
ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa );
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb );
/* Typecast BLAS integers to BLIS integers. */
bli_convert_blas_dim1( *m, m0 );
bli_convert_blas_dim1( *n, n0 );
bli_convert_blas_dim1( *k, k0 );
/* Set the row and column strides of the matrix operands. */
const inc_t rs_a = 1;
const inc_t cs_a = *lda;
const inc_t rs_b = 1;
const inc_t cs_b = *ldb;
const inc_t rs_c = 1;
const inc_t cs_c = *ldc;
const num_t dt = BLIS_DCOMPLEX;
/* Wrap scalars and matrices in BLIS objects for the object APIs below. */
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t ao = BLIS_OBJECT_INITIALIZER;
obj_t bo = BLIS_OBJECT_INITIALIZER;
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t co = BLIS_OBJECT_INITIALIZER;
dim_t m0_a, n0_a;
dim_t m0_b, n0_b;
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a );
bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b );
bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao );
bli_obj_init_finish_1x1( dt, (dcomplex*)beta, &betao );
bli_obj_init_finish( dt, m0_a, n0_a, (dcomplex*)a, rs_a, cs_a, &ao );
bli_obj_init_finish( dt, m0_b, n0_b, (dcomplex*)b, rs_b, cs_b, &bo );
bli_obj_init_finish( dt, m0, n0, (dcomplex*)c, rs_c, cs_c, &co );
bli_obj_set_conjtrans( blis_transa, &ao );
bli_obj_set_conjtrans( blis_transb, &bo );
// default instance peformance tuning is done in zgemm.
// Single instance tuning is done based on env set.
dim_t single_instance = bli_env_get_var( "BLIS_SINGLE_INSTANCE", -1 );
//dim_t nt = bli_thread_get_num_threads(); // get number of threads
bool nt = bli_thread_get_is_parallel(); // Check if parallel zgemm is invoked.
if ( nt )
{
// Will call parallelized zgemm code - sup & native
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
(
&alphao,
&ao,
&bo,
&betao,
&co,
NULL,
NULL
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
// The code below will be called when number of threads = 1.
#if ENABLE_INDUCED_METHOD
/* 3m_sqp is optimal for certain matrix shapes.
Initial study that it works well for square sizes and sizes closer to square shape.
* Usage of 3m_sqp is restricted to sizes, where it is found efficient compared to native, sup and other induced method.
* Further investigation is necessary to make the usage choices more generic. */
bool sqp_on = false;
if( (m0 == n0 ) && ( n0 == k0 ) && ( m0 == 128 ) )
{
sqp_on = true;
}
// current range of sizes used for 3m_sqp to be expaned after evaluation.
if( ( m0 >= 4200) && ( m0 <= 4600 ) && ( ( n0 >= 326 ) || (n0 <= 1600 ) )
&& ( k0 == 1120 ) ) //to be tuned further.
{
sqp_on = true;
}
if( ( blis_transb == BLIS_NO_TRANSPOSE) && ( sqp_on == true ) )
{
//sqp algo is found better for n > 40
if(bli_gemm_sqp(&alphao, &ao, &bo, &betao, &co, NULL, NULL)==BLIS_SUCCESS)
{
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
return;
}
}
#endif//ENABLE_INDUCED_METHOD
// native tuning resulted in better numbers compared to sup in constrained multi-instance
// sup has been enabled for single instance cases.
if(single_instance==1)
{
err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
if(status==BLIS_SUCCESS)
{
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
return;
}
}
// fall back on native path when zgemm is not handled in sup path.
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. (Previously unreachable due to a stray `return;`.) */
bli_finalize_auto();
}// end of zgemm_
// Instantiate the generic BLAS gemm wrappers for the remaining types
// (s and c) via the GENTFUNC template defined earlier in this file.
INSERT_GENTFUNC_BLAS_SC( gemm, gemm )
// Observed a regression in dgemm with this function addition.
// Disabling temporarily.
#if 0
// Mixed-domain gemm: A is real double, B and C are double-complex.
// Compiled out (see note above); when enabled it routes straight to the
// native path with no sup/small-matrix dispatch.
void dzgemm_
(
const f77_char* transa,
const f77_char* transb,
const f77_int* m,
const f77_int* n,
const f77_int* k,
const dcomplex* alpha,
const double* a, const f77_int* lda,
const dcomplex* b, const f77_int* ldb,
const dcomplex* beta,
dcomplex* c, const f77_int* ldc
)
{
trans_t blis_transa;
trans_t blis_transb;
dim_t m0, n0, k0;
/* Initialize BLIS. */
bli_init_auto();
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k,
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemm)
(
MKSTR(z),
MKSTR(gemm),
transa,
transb,
m,
n,
k,
lda,
ldb,
ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa );
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb );
/* Typecast BLAS integers to BLIS integers. */
bli_convert_blas_dim1( *m, m0 );
bli_convert_blas_dim1( *n, n0 );
bli_convert_blas_dim1( *k, k0 );
/* Set the row and column strides of the matrix operands. */
const inc_t rs_a = 1;
const inc_t cs_a = *lda;
const inc_t rs_b = 1;
const inc_t cs_b = *ldb;
const inc_t rs_c = 1;
const inc_t cs_c = *ldc;
// B/C use the complex datatype; A is typed as real double (dt_a).
const num_t dt = BLIS_DCOMPLEX;
const num_t dt_a = BLIS_DOUBLE;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t ao = BLIS_OBJECT_INITIALIZER;
obj_t bo = BLIS_OBJECT_INITIALIZER;
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t co = BLIS_OBJECT_INITIALIZER;
dim_t m0_a, n0_a;
dim_t m0_b, n0_b;
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a );
bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b );
bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao );
bli_obj_init_finish_1x1( dt, (dcomplex*)beta, &betao );
// NOTE(review): `a` is const double* and `ao` is typed dt_a (BLIS_DOUBLE);
// the (dcomplex*) cast below looks wrong — harmless since the buffer is
// stored as void*, but (double*) would be correct. Confirm before
// re-enabling this function.
bli_obj_init_finish( dt_a, m0_a, n0_a, (dcomplex*)a, rs_a, cs_a, &ao );
bli_obj_init_finish( dt, m0_b, n0_b, (dcomplex*)b, rs_b, cs_b, &bo );
bli_obj_init_finish( dt, m0, n0, (dcomplex*)c, rs_c, cs_c, &co );
bli_obj_set_conjtrans( blis_transa, &ao );
bli_obj_set_conjtrans( blis_transb, &bo );
// fall back on native path when zgemm is not handled in sup path.
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
bli_finalize_auto();
}// end of dzgemm_
#endif
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020 - 21, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -147,856 +147,5 @@ void PASTEF77(ch,blasname) \
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_EPYC
// BLAS-compatible dgemv_: y := beta*y + alpha*op(A)*x, double precision,
// column-major A. Normalizes BLAS arguments (trans char, negative dims,
// negative increments), emulates BLAS's quirky early-return cases, then
// dispatches: reference/context path on non-Zen hardware, otherwise the
// hand-optimized unblocked variants (var2 for no-transpose, var1 otherwise).
void dgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const double* alpha,
const double* a, const f77_int* lda,
const double* x, const f77_int* incx,
const double* beta,
double* y, const f77_int* incy
)
{
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(d),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
// Quick return for degenerate dimensions.
if (*m == 0 || *n == 0)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
//bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if ( *m < 0 ) m0 = ( dim_t )0;
else m0 = ( dim_t )(*m);
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if ( bli_does_notrans( blis_transa ) )
{
m_y = m0;
n_x = n0;
}
else
{
m_y = n0;
n_x = m0;
}
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
the operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
x0 = ((double*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((double*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((double*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((double*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A. */
rs_a = 1;
cs_a = *lda;
// When dynamic dispatch is enabled i.e. library is built for amdzen
// configuration, this function is invoked on all architectures including
// generic. Invoke the architecture-specific kernels only if we are sure
// that we are running on zen, zen2, zen3 or zen4; otherwise fall back to
// reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen == 0)
{
/* Call BLIS interface. */
PASTEMAC2(d,gemv,BLIS_TAPI_EX_SUF)
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(double*)alpha,
(double*)a, rs_a, cs_a,
x0, incx0,
(double*)beta,
y0, incy0,
NULL,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Call variants based on transpose value. */
if(bli_does_notrans(blis_transa))
{
//variant_2 is chosen for column-storage
// and uses axpyf-based implementation
bli_dgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(double*)alpha,
(double*)a, rs_a, cs_a,
x0, incx0,
(double*)beta,
y0, incy0,
NULL
);
}
else
{
//var_1 is chosen for row-storage
//and uses dotxf-based implementation
bli_dgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(double*)alpha,
(double*)a, rs_a, cs_a,
x0, incx0,
(double*)beta,
y0, incy0,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
// BLAS-compatible sgemv_: y := beta*y + alpha*op(A)*x, single precision.
// Mirrors dgemv_ above: argument normalization, BLAS bug-for-bug early
// returns, then Zen-family runtime dispatch between optimized unblocked
// variants and the reference/context path.
void sgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const float* alpha,
const float* a, const f77_int* lda,
const float* x, const f77_int* incx,
const float* beta,
float* y, const f77_int* incy
)
{
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(s),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
// Quick return for degenerate dimensions.
if (*m == 0 || *n == 0)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
//bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if ( *m < 0 ) m0 = ( dim_t )0;
else m0 = ( dim_t )(*m);
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if ( bli_does_notrans( blis_transa ) )
{
m_y = m0;
n_x = n0;
}
else
{
m_y = n0;
n_x = m0;
}
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
the operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
x0 = ((float*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((float*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((float*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((float*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A. */
rs_a = 1;
cs_a = *lda;
// When dynamic dispatch is enabled i.e. library is built for amdzen
// configuration, this function is invoked on all architectures including
// generic. Invoke the architecture-specific kernels only if we are sure
// that we are running on zen, zen2, zen3 or zen4; otherwise fall back to
// reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen == 0)
{
/* Call BLIS interface. */
PASTEMAC2(s,gemv,BLIS_TAPI_EX_SUF)
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(float*)alpha,
(float*)a, rs_a, cs_a,
x0, incx0,
(float*)beta,
y0, incy0,
NULL,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Call variants based on transpose value. */
if(bli_does_notrans(blis_transa))
{
// axpyf-based variant for the no-transpose (column-access) case.
bli_sgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(float*)alpha,
(float*)a, rs_a, cs_a,
x0, incx0,
(float*)beta,
y0, incy0,
NULL
);
}
else
{
// dotxf-based variant for the transpose (row-access) case.
bli_sgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(float*)alpha,
(float*)a, rs_a, cs_a,
x0, incx0,
(float*)beta,
y0, incy0,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
// BLAS-compatible cgemv_: y := beta*y + alpha*op(A)*x, single-precision
// complex. Same structure as dgemv_/sgemv_, plus a fast path when the
// result vector has a single element (m_y == 1): the gemv collapses to a
// dot product, computed via an optimized dotv kernel on Zen hardware.
void cgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const scomplex* alpha,
const scomplex* a, const f77_int* lda,
const scomplex* x, const f77_int* incx,
const scomplex* beta,
scomplex* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
scomplex* x0;
scomplex* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(c),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
// Quick return for degenerate dimensions.
if (*m == 0 || *n == 0)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
if( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if( * transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
// bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if( *m < 0 ) m0 = (dim_t)0;
else m0 = (dim_t)(*m);
if( *n < 0 ) n0 = (dim_t)0;
else n0 = (dim_t)(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if( bli_does_notrans( blis_transa ) ) { m_y = m0, n_x = n0; }
else { m_y = n0; n_x = m0; }
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
the operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if( *incx < 0 )
{
x0 = ((scomplex*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((scomplex*)x);
incx0 = (inc_t)(*incx);
}
if ( *incy < 0 )
{
y0 = ((scomplex*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((scomplex*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A. */
rs_a = 1;
cs_a = *lda;
// When dynamic dispatch is enabled i.e. library is built for amdzen
// configuration, this function is invoked on all architectures including
// generic. Invoke the architecture-specific kernels only if we are sure
// that we are running on zen, zen2, zen3 or zen4; otherwise fall back to
// reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
// Single-element result: reduce gemv to a dot product,
// then y[0] = beta*y[0] + alpha*rho by hand.
if( m_y == 1 )
{
conj_t conja = bli_extract_conj(blis_transa);
scomplex rho;
if (bamdzen)
{
bli_cdotv_zen_int5
(
conja,
BLIS_NO_CONJUGATE,
n_x,
(scomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
x0, incx0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
(
conja,
BLIS_NO_CONJUGATE,
n_x,
(scomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
x0, incx0,
&rho,
NULL,
NULL
);
}
scomplex yval = *y0;
if(!bli_ceq0(*beta))
{
bli_cscals( *beta, yval );
}
else
{
// beta == 0: overwrite rather than scale (avoids NaN propagation).
bli_csetsc( 0.0, 0.0, &yval);
}
if(!bli_ceq0(*alpha))
{
bli_caxpys( *alpha, rho, yval);
}
y0->real = yval.real;
y0->imag = yval.imag;
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
if (bamdzen == 0)
{
/* Call BLIS interface. */
PASTEMAC2(c,gemv,BLIS_TAPI_EX_SUF)
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(scomplex*)alpha,
(scomplex*)a, rs_a, cs_a,
x0, incx0,
(scomplex*)beta,
y0, incy0,
NULL,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* call variants based on transpose value */
if( bli_does_notrans( blis_transa ) )
{
bli_cgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(scomplex*)alpha,
(scomplex*)a, rs_a, cs_a,
x0, incx0,
(scomplex*)beta,
y0, incy0,
NULL
);
}
else
{
bli_cgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(scomplex*)alpha,
(scomplex*)a, rs_a, cs_a,
x0, incx0,
(scomplex*)beta,
y0, incy0,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
// BLAS-compatible zgemv_: y := beta*y + alpha*op(A)*x, double-precision
// complex. Structurally identical to cgemv_ above, including the
// single-element (m_y == 1) dot-product fast path.
void zgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const dcomplex* alpha,
const dcomplex* a, const f77_int* lda,
const dcomplex* x, const f77_int* incx,
const dcomplex* beta,
dcomplex* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
dcomplex* x0;
dcomplex* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(z),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
// Quick return for degenerate dimensions.
if (*m == 0 || *n == 0)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
if( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if( * transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
// bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if( *m < 0 ) m0 = (dim_t)0;
else m0 = (dim_t)(*m);
if( *n < 0 ) n0 = (dim_t)0;
else n0 = (dim_t)(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if( bli_does_notrans( blis_transa ) ) { m_y = m0, n_x = n0; }
else { m_y = n0; n_x = m0; }
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
the operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if( *incx < 0 )
{
x0 = ((dcomplex*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((dcomplex*)x);
incx0 = (inc_t)(*incx);
}
if ( *incy < 0 )
{
y0 = ((dcomplex*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((dcomplex*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A. */
rs_a = 1;
cs_a = *lda;
// When dynamic dispatch is enabled i.e. library is built for amdzen
// configuration, this function is invoked on all architectures including
// generic. Invoke the architecture-specific kernels only if we are sure
// that we are running on zen, zen2, zen3 or zen4; otherwise fall back to
// reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
// Single-element result: reduce gemv to a dot product,
// then y[0] = beta*y[0] + alpha*rho by hand.
if( m_y == 1 )
{
conj_t conja = bli_extract_conj(blis_transa);
dcomplex rho;
if (bamdzen)
{
bli_zdotv_zen_int5
(
conja,
BLIS_NO_CONJUGATE,
n_x,
(dcomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
x0, incx0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
(
conja,
BLIS_NO_CONJUGATE,
n_x,
(dcomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
x0, incx0,
&rho,
NULL,
NULL
);
}
dcomplex yval = *y0;
if(!bli_zeq0(*beta))
{
bli_zscals( *beta, yval );
}
else
{
// beta == 0: overwrite rather than scale (avoids NaN propagation).
bli_zsetsc( 0.0, 0.0, &yval);
}
if(!bli_zeq0(*alpha))
{
bli_zaxpys( *alpha, rho, yval);
}
y0->real = yval.real;
y0->imag = yval.imag;
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
if (bamdzen == 0)
{
/* Call BLIS interface. */
PASTEMAC2(z,gemv,BLIS_TAPI_EX_SUF)
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(dcomplex*)alpha,
(dcomplex*)a, rs_a, cs_a,
x0, incx0,
(dcomplex*)beta,
y0, incy0,
NULL,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* call variants based on transpose value */
if( bli_does_notrans( blis_transa ) )
{
bli_zgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(dcomplex*)alpha,
(dcomplex*)a, rs_a, cs_a,
x0, incx0,
(dcomplex*)beta,
y0, incy0,
NULL
);
}
else
{
bli_zgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(dcomplex*)alpha,
(dcomplex*)a, rs_a, cs_a,
x0, incx0,
(dcomplex*)beta,
y0, incy0,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
#else
// Non-EPYC builds: instantiate the generic gemv wrappers for all types.
INSERT_GENTFUNC_BLAS( gemv, gemv )
#endif
#endif

963
frame/compat/bla_gemv_amd.c Normal file
View File

@@ -0,0 +1,963 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template that emits a BLAS-style gemv wrapper (sgemv_/dgemv_/cgemv_/zgemv_)
// for element type `ftype` with type char `ch`: it validates the BLAS
// arguments, normalizes trans/dimensions/increments (including BLAS's
// bug-for-bug quick-return quirks), and forwards to the BLIS typed expert
// API, leaving kernel selection to the runtime context.
// NOTE: inside the macro body only /* */ comments are safe — a // comment
// would swallow the backslash line continuations.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_char* transa, \
const f77_int* m, \
const f77_int* n, \
const ftype* alpha, \
const ftype* a, const f77_int* lda, \
const ftype* x, const f77_int* incx, \
const ftype* beta, \
ftype* y, const f77_int* incy \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); \
trans_t blis_transa; \
dim_t m0, n0; \
dim_t m_y, n_x; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
inc_t rs_a, cs_a; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
transa, \
m, \
n, \
lda, \
incx, \
incy \
); \
\
if (*m == 0 || *n == 0) { \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
return; \
} \
\
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
\
/* Convert/typecast negative values of m and n to zero. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *n, n0 ); \
\
/* Determine the dimensions of x and y so we can adjust the increments,
   if necessary.*/ \
bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \
\
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
   in a peculiar way. In these situations, BLAS returns without performing
   any action, even though most sane interpretations of gemv would have the
   the operation reduce to y := beta * y. Here, we catch those cases that
   BLAS would normally mishandle and emulate the BLAS exactly so as to
   provide "bug-for-bug" compatibility. Note that this extreme level of
   compatibility would not be as much of an issue if it weren't for the
   fact that some BLAS test suites actually test for these cases. Also, it
   should be emphasized that BLIS, if called natively, does NOT exhibit
   this quirky behavior; it will scale y by beta, as one would expect. */ \
if ( m_y > 0 && n_x == 0 ) \
{ \
	/* Finalize BLIS. */ \
	bli_finalize_auto(); \
\
	return; \
} \
\
/* If the input increments are negative, adjust the pointers so we can
   use positive increments instead. */ \
bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \
\
/* Set the row and column strides of A. */ \
rs_a = 1; \
cs_a = *lda; \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
  blis_transa, \
  BLIS_NO_CONJUGATE, \
  m0, \
  n0, \
  (ftype*)alpha, \
  (ftype*)a, rs_a, cs_a, \
  x0, incx0, \
  (ftype*)beta, \
  y0, incy0, \
  NULL, \
  NULL  \
); \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
/* BLAS dgemv compatibility wrapper (AMD-optimized source file):
   computes y := alpha * op(A) * x + beta * y, with op(A) selected by
   transa ('N', 'T', or 'C').  On CPUs without AVX support it forwards
   to the context-driven BLIS typed API; on AVX-capable CPUs it calls
   the unblocked BLIS gemv variants directly. */
void dgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const double* alpha,
const double* a, const f77_int* lda,
const double* x, const f77_int* incx,
const double* beta,
double* y, const f77_int* incy
)
{
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(d),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
/* Quick return for empty dimensions, mirroring reference BLAS. */
if (*m == 0 || *n == 0)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Map BLAS chars to their corresponding BLIS enumerated type value.
   An unrecognized transa falls back to no-transpose rather than
   aborting (error reporting is left to the parameter check above). */
if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
//bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if ( *m < 0 ) m0 = ( dim_t )0;
else m0 = ( dim_t )(*m);
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if ( bli_does_notrans( blis_transa ) )
{
m_y = m0;
n_x = n0;
}
else
{
m_y = n0;
n_x = m0;
}
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. (BLAS passes the first element even
for negative strides; BLIS expects the address of the last element.) */
if ( *incx < 0 )
{
x0 = ((double*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((double*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((double*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((double*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A. A is column-stored per BLAS:
   unit row stride, column stride equal to the leading dimension. */
rs_a = 1;
cs_a = *lda;
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
/* Call BLIS interface. */
PASTEMAC2(d,gemv,BLIS_TAPI_EX_SUF)
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(double*)alpha,
(double*)a, rs_a, cs_a,
x0, incx0,
(double*)beta,
y0, incy0,
NULL,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Call variants based on transpose value. */
if(bli_does_notrans(blis_transa))
{
//variant_2 is chosen for column-storage
// and uses axpyf-based implementation
bli_dgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(double*)alpha,
(double*)a, rs_a, cs_a,
x0, incx0,
(double*)beta,
y0, incy0,
NULL
);
}
else
{
//var_1 is chosen for row-storage
//and uses dotxf-based implementation
bli_dgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(double*)alpha,
(double*)a, rs_a, cs_a,
x0, incx0,
(double*)beta,
y0, incy0,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
/* BLAS sgemv compatibility wrapper (AMD-optimized source file):
   computes y := alpha * op(A) * x + beta * y for single precision.
   Structure mirrors dgemv_ above: non-AVX CPUs go through the
   context-driven typed API; AVX CPUs call the unblocked variants. */
void sgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const float* alpha,
const float* a, const f77_int* lda,
const float* x, const f77_int* incx,
const float* beta,
float* y, const f77_int* incy
)
{
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(s),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
/* Quick return for empty dimensions, mirroring reference BLAS. */
if (*m == 0 || *n == 0)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Map BLAS chars to their corresponding BLIS enumerated type value.
   Unrecognized transa falls back to no-transpose (see dgemv_). */
if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
//bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if ( *m < 0 ) m0 = ( dim_t )0;
else m0 = ( dim_t )(*m);
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if ( bli_does_notrans( blis_transa ) )
{
m_y = m0;
n_x = n0;
}
else
{
m_y = n0;
n_x = m0;
}
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
x0 = ((float*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((float*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((float*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((float*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A (column-stored per BLAS). */
rs_a = 1;
cs_a = *lda;
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
/* Call BLIS interface. */
PASTEMAC2(s,gemv,BLIS_TAPI_EX_SUF)
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(float*)alpha,
(float*)a, rs_a, cs_a,
x0, incx0,
(float*)beta,
y0, incy0,
NULL,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Call variants based on transpose value: var2 (axpyf-based) for
   no-transpose, var1 (dotxf-based) otherwise. */
if(bli_does_notrans(blis_transa))
{
bli_sgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(float*)alpha,
(float*)a, rs_a, cs_a,
x0, incx0,
(float*)beta,
y0, incy0,
NULL
);
}
else
{
bli_sgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(float*)alpha,
(float*)a, rs_a, cs_a,
x0, incx0,
(float*)beta,
y0, incy0,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
/* BLAS cgemv compatibility wrapper (AMD-optimized source file):
   computes y := alpha * op(A) * x + beta * y for single-precision
   complex.  In addition to the AVX/non-AVX dispatch used by
   dgemv_/sgemv_, this routine has a fast path for the case where
   op(A) has exactly one row, reducing gemv to a dot product. */
void cgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const scomplex* alpha,
const scomplex* a, const f77_int* lda,
const scomplex* x, const f77_int* incx,
const scomplex* beta,
scomplex* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
scomplex* x0;
scomplex* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(c),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
/* Quick return for empty dimensions, mirroring reference BLAS. */
if (*m == 0 || *n == 0)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
if( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if( * transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
// bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if( *m < 0 ) m0 = (dim_t)0;
else m0 = (dim_t)(*m);
if( *n < 0 ) n0 = (dim_t)0;
else n0 = (dim_t)(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if( bli_does_notrans( blis_transa ) ) { m_y = m0, n_x = n0; }
else { m_y = n0; n_x = m0; }
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if( *incx < 0 )
{
x0 = ((scomplex*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((scomplex*)x);
incx0 = (inc_t)(*incx);
}
if ( *incy < 0 )
{
y0 = ((scomplex*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((scomplex*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A (column-stored per BLAS). */
rs_a = 1;
cs_a = *lda;
/* Fast path: op(A) has exactly one row, so gemv reduces to
   rho = op(A) * x (a dot product over n_x elements, with the
   conjugation implied by transa), followed by the scalar update
   y[0] := beta * y[0] + alpha * rho. */
if( m_y == 1 )
{
conj_t conja = bli_extract_conj(blis_transa);
scomplex rho;
if (bli_cpuid_is_avx_supported() == TRUE)
{
bli_cdotv_zen_int5
(
conja,
BLIS_NO_CONJUGATE,
n_x,
(scomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
x0, incx0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
(
conja,
BLIS_NO_CONJUGATE,
n_x,
(scomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
x0, incx0,
&rho,
NULL,
NULL
);
}
/* Combine rho into y[0]. If beta is zero, y[0] is overwritten with
   zero (not scaled); if alpha is zero, rho is not accumulated. */
scomplex yval = *y0;
if(!bli_ceq0(*beta))
{
bli_cscals( *beta, yval );
}
else
{
bli_csetsc( 0.0, 0.0, &yval);
}
if(!bli_ceq0(*alpha))
{
bli_caxpys( *alpha, rho, yval);
}
y0->real = yval.real;
y0->imag = yval.imag;
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
/* Call BLIS interface. */
PASTEMAC2(c,gemv,BLIS_TAPI_EX_SUF)
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(scomplex*)alpha,
(scomplex*)a, rs_a, cs_a,
x0, incx0,
(scomplex*)beta,
y0, incy0,
NULL,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* call variants based on transpose value */
if( bli_does_notrans( blis_transa ) )
{
bli_cgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(scomplex*)alpha,
(scomplex*)a, rs_a, cs_a,
x0, incx0,
(scomplex*)beta,
y0, incy0,
NULL
);
}
else
{
bli_cgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(scomplex*)alpha,
(scomplex*)a, rs_a, cs_a,
x0, incx0,
(scomplex*)beta,
y0, incy0,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
/* BLAS zgemv compatibility wrapper (AMD-optimized source file):
   computes y := alpha * op(A) * x + beta * y for double-precision
   complex.  Mirrors cgemv_ above, including the single-row
   dot-product fast path and the AVX/non-AVX dispatch. */
void zgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const dcomplex* alpha,
const dcomplex* a, const f77_int* lda,
const dcomplex* x, const f77_int* incx,
const dcomplex* beta,
dcomplex* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
dcomplex* x0;
dcomplex* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(z),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
/* Quick return for empty dimensions, mirroring reference BLAS. */
if (*m == 0 || *n == 0)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
if( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if( * transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
// bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if( *m < 0 ) m0 = (dim_t)0;
else m0 = (dim_t)(*m);
if( *n < 0 ) n0 = (dim_t)0;
else n0 = (dim_t)(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if( bli_does_notrans( blis_transa ) ) { m_y = m0, n_x = n0; }
else { m_y = n0; n_x = m0; }
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if( *incx < 0 )
{
x0 = ((dcomplex*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((dcomplex*)x);
incx0 = (inc_t)(*incx);
}
if ( *incy < 0 )
{
y0 = ((dcomplex*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((dcomplex*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A (column-stored per BLAS). */
rs_a = 1;
cs_a = *lda;
/* Fast path: op(A) has exactly one row, so gemv reduces to
   rho = op(A) * x (a dot product over n_x elements, with the
   conjugation implied by transa), followed by the scalar update
   y[0] := beta * y[0] + alpha * rho. */
if( m_y == 1 )
{
conj_t conja = bli_extract_conj(blis_transa);
dcomplex rho;
if (bli_cpuid_is_avx_supported() == TRUE)
{
bli_zdotv_zen_int5
(
conja,
BLIS_NO_CONJUGATE,
n_x,
(dcomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
x0, incx0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
(
conja,
BLIS_NO_CONJUGATE,
n_x,
(dcomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
x0, incx0,
&rho,
NULL,
NULL
);
}
/* Combine rho into y[0]. If beta is zero, y[0] is overwritten with
   zero (not scaled); if alpha is zero, rho is not accumulated. */
dcomplex yval = *y0;
if(!bli_zeq0(*beta))
{
bli_zscals( *beta, yval );
}
else
{
bli_zsetsc( 0.0, 0.0, &yval);
}
if(!bli_zeq0(*alpha))
{
bli_zaxpys( *alpha, rho, yval);
}
y0->real = yval.real;
y0->imag = yval.imag;
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
/* Call BLIS interface. */
PASTEMAC2(z,gemv,BLIS_TAPI_EX_SUF)
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(dcomplex*)alpha,
(dcomplex*)a, rs_a, cs_a,
x0, incx0,
(dcomplex*)beta,
y0, incy0,
NULL,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* call variants based on transpose value */
if( bli_does_notrans( blis_transa ) )
{
bli_zgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(dcomplex*)alpha,
(dcomplex*)a, rs_a, cs_a,
x0, incx0,
(dcomplex*)beta,
y0, incy0,
NULL
);
}
else
{
bli_zgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(dcomplex*)alpha,
(dcomplex*)a, rs_a, cs_a,
x0, incx0,
(dcomplex*)beta,
y0, incy0,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-21, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -93,179 +93,5 @@ void PASTEF772(chx,cha,blasname) \
}
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_EPYC
void sscal_
(
const f77_int* n,
const float* alpha,
float* x, const f77_int* incx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', (void *) alpha, *n, *incx );
dim_t n0;
float* x0;
inc_t incx0;
/* Initialize BLIS. */
//bli_init_auto();
if (*n == 0 || alpha == NULL) {
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = (x);
incx0 = ( inc_t )(*incx);
}
/* Call BLIS kernel */
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
bli_sscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n0,
(float *)alpha,
x0, incx0,
NULL
);
}
else{
PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE,\
n0, \
(float *)alpha,\
x0, incx0,\
NULL, \
NULL \
);\
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
}
void dscal_
(
const f77_int* n,
const double* alpha,
double* x, const f77_int* incx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', (void *)alpha, *n, *incx );
dim_t n0;
double* x0;
inc_t incx0;
/* Initialize BLIS */
//bli_init_auto();
if (*n == 0 || alpha == NULL) {
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Convert typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = (x);
incx0 = ( inc_t )(*incx);
}
/* Call BLIS kernel */
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen){
bli_dscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n0,
(double*) alpha,
x0, incx0,
NULL
);
}
else{
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE,\
n0, \
(double *)alpha,\
x0, incx0,\
NULL, \
NULL \
);\
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
}
INSERT_GENTFUNCSCAL_BLAS_CZ( scal, scalv )
#else
INSERT_GENTFUNCSCAL_BLAS( scal, scalv )
#endif
#endif

260
frame/compat/bla_scal_amd.c Normal file
View File

@@ -0,0 +1,260 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
/* GENTFUNCSCAL: template for the generic BLAS ?scal compatibility
   wrapper, x := alpha * x.  ftype_x/chx describe the vector type and
   ftype_a/cha the alpha type, so mixed-type cases (csscal/zdscal) can
   be generated as well.  In this file the template is instantiated
   for the complex types via INSERT_GENTFUNCSCAL_BLAS_CZ below;
   sscal_/dscal_ are hand-written with AVX-aware dispatch instead. */
#undef GENTFUNCSCAL
#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \
\
void PASTEF772(chx,cha,blasname) \
( \
const f77_int* n, \
const ftype_a* alpha, \
ftype_x* x, const f77_int* incx \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
dim_t n0; \
ftype_x* x0; \
inc_t incx0; \
ftype_x alpha_cast; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
if (*n == 0 || alpha == NULL) { \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
return ; \
} \
\
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \
\
/* NOTE: We do not natively implement BLAS's csscal/zdscal in BLIS.
that is, we just always sub-optimally implement those cases
by casting alpha to ctype_x (potentially the complex domain) and
using the homogeneous datatype instance according to that type. */ \
PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \
\
/* Call BLIS interface. */ \
PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
n0, \
&alpha_cast, \
x0, incx0, \
NULL, \
NULL \
); \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
void sscal_
(
const f77_int* n,
const float* alpha,
float* x, const f77_int* incx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', (void *) alpha, *n, *incx );
dim_t n0;
float* x0;
inc_t incx0;
/* Initialize BLIS. */
//bli_init_auto();
if (*n == 0 || alpha == NULL) {
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = (x);
incx0 = ( inc_t )(*incx);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE) {
bli_sscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n0,
(float *)alpha,
x0, incx0,
NULL
);
}
else{
PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE,\
n0, \
(float *)alpha,\
x0, incx0,\
NULL, \
NULL \
);\
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
}
void dscal_
(
const f77_int* n,
const double* alpha,
double* x, const f77_int* incx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', (void *)alpha, *n, *incx );
dim_t n0;
double* x0;
inc_t incx0;
/* Initialize BLIS */
//bli_init_auto();
if (*n == 0 || alpha == NULL) {
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Convert typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = (x);
incx0 = ( inc_t )(*incx);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE){
bli_dscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n0,
(double*) alpha,
x0, incx0,
NULL
);
}
else{
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE,\
n0, \
(double *)alpha,\
x0, incx0,\
NULL, \
NULL \
);\
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
}
INSERT_GENTFUNCSCAL_BLAS_CZ( scal, scalv )
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-21, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -83,198 +83,5 @@ void PASTEF77(ch,blasname) \
}
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_EPYC
void sswap_
(
const f77_int* n,
float* x, const f77_int* incx,
float* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SWAP_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy);
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = (x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = (y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = (y);
incy0 = ( inc_t )(*incy);
}
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
/* Call BLIS kernel */
bli_sswapv_zen_int8
(
n0,
x0, incx0,
y0, incy0,
NULL
);
}
else{
PASTEMAC2(s,swapv,BLIS_TAPI_EX_SUF) \
( \
n0, \
x0, incx0, \
y0, incy0, \
NULL, \
NULL \
); \
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
}
void dswap_
(
const f77_int* n,
double* x, const f77_int* incx,
double* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SWAP_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy);
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = (x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = (y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = (y);
incy0 = ( inc_t )(*incy);
}
/* Call BLIS kernel */
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
bli_dswapv_zen_int8
(
n0,
x0, incx0,
y0, incy0,
NULL
);
}
else{
PASTEMAC2(d,swapv,BLIS_TAPI_EX_SUF) \
( \
n0, \
x0, incx0, \
y0, incy0, \
NULL, \
NULL \
); \
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
}
INSERT_GENTFUNC_BLAS_CZ( swap, swapv )
#else
INSERT_GENTFUNC_BLAS( swap, swapv )
#endif
#endif

268
frame/compat/bla_swap_amd.c Normal file
View File

@@ -0,0 +1,268 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNC instantiates the generic ?swap_ BLAS wrapper for one datatype
// (ftype/ch): it converts the Fortran-style pointer arguments to BLIS
// types via bli_convert_blas_dim1/bli_convert_blas_incv, then calls the
// BLIS expert typed API (PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF)).
// In this AMD-specific file the template is used only for the datatypes
// that have no hand-written kernel dispatch below (instantiated for the
// complex types via INSERT_GENTFUNC_BLAS_CZ at the end of the file).
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_int* n, \
ftype* x, const f77_int* incx, \
ftype* y, const f77_int* incy \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
dim_t n0; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
n0, \
x0, incx0, \
y0, incy0, \
NULL, \
NULL \
); \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
/* BLAS sswap_ wrapper (AMD build): swap the n elements of the
   single-precision vectors x (stride incx) and y (stride incy).

   On CPUs that support AVX the call is routed directly to the
   hand-optimized Zen kernel; on all other architectures (including
   generic) it falls back to the swapv kernel obtained from the
   runtime context. */
void sswap_
(
 const f77_int* n,
 float* x, const f77_int* incx,
 float* y, const f77_int* incy
)
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_SWAP_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy);
    dim_t  n0;
    float* x0;
    float* y0;
    inc_t  incx0;
    inc_t  incy0;

    /* BLIS initialization is intentionally skipped on this fast path;
       it is expected to have happened at library level already. */
    // bli_init_auto();

    /* Convert/typecast negative values of n to zero. */
    if ( *n < 0 ) n0 = ( dim_t )0;
    else          n0 = ( dim_t )(*n);

    /* BLAS negative-stride semantics: the caller always passes the 0th
       (top-most/left-most) element, and a negative stride means the
       vector is traversed in reverse.  BLIS instead expects the address
       of the (n-1)th element together with the (still negative) stride,
       so advance the base pointer to the last element while keeping the
       stride exactly as given. */
    if ( *incx < 0 )
    {
        x0    = (x) + (n0-1)*(-*incx);
        incx0 = ( inc_t )(*incx);
    }
    else
    {
        x0    = (x);
        incx0 = ( inc_t )(*incx);
    }

    if ( *incy < 0 )
    {
        y0    = (y) + (n0-1)*(-*incy);
        incy0 = ( inc_t )(*incy);
    }
    else
    {
        y0    = (y);
        incy0 = ( inc_t )(*incy);
    }

    /* This function is invoked on all architectures including generic.
       Non-AVX platforms use the kernel derived from the context. */
    if ( bli_cpuid_is_avx_supported() == TRUE )
    {
        /* Call the AMD Zen-optimized BLIS kernel directly. */
        bli_sswapv_zen_int8
        (
          n0,
          x0, incx0,
          y0, incy0,
          NULL
        );
    }
    else
    {
        /* Reference path via the expert typed API. */
        PASTEMAC2(s,swapv,BLIS_TAPI_EX_SUF)
        (
          n0,
          x0, incx0,
          y0, incy0,
          NULL,
          NULL
        );
    }

    /* Finalization is likewise deferred to library teardown. */
    // bli_finalize_auto();

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
}
/* BLAS dswap_ wrapper (AMD build): swap the n elements of the
   double-precision vectors x (stride incx) and y (stride incy).

   On CPUs that support AVX the call is routed directly to the
   hand-optimized Zen kernel; on all other architectures (including
   generic) it falls back to the swapv kernel obtained from the
   runtime context. */
void dswap_
(
 const f77_int* n,
 double* x, const f77_int* incx,
 double* y, const f77_int* incy
)
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_SWAP_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy);
    dim_t   n0;
    double* x0;
    double* y0;
    inc_t   incx0;
    inc_t   incy0;

    /* BLIS initialization is intentionally skipped on this fast path;
       it is expected to have happened at library level already. */
    // bli_init_auto();

    /* Convert/typecast negative values of n to zero. */
    if ( *n < 0 ) n0 = ( dim_t )0;
    else          n0 = ( dim_t )(*n);

    /* BLAS negative-stride semantics: the caller always passes the 0th
       (top-most/left-most) element, and a negative stride means the
       vector is traversed in reverse.  BLIS instead expects the address
       of the (n-1)th element together with the (still negative) stride,
       so advance the base pointer to the last element while keeping the
       stride exactly as given. */
    if ( *incx < 0 )
    {
        x0    = (x) + (n0-1)*(-*incx);
        incx0 = ( inc_t )(*incx);
    }
    else
    {
        x0    = (x);
        incx0 = ( inc_t )(*incx);
    }

    if ( *incy < 0 )
    {
        y0    = (y) + (n0-1)*(-*incy);
        incy0 = ( inc_t )(*incy);
    }
    else
    {
        y0    = (y);
        incy0 = ( inc_t )(*incy);
    }

    /* This function is invoked on all architectures including generic.
       Non-AVX platforms use the kernel derived from the context. */
    if ( bli_cpuid_is_avx_supported() == TRUE )
    {
        /* Call the AMD Zen-optimized BLIS kernel directly. */
        bli_dswapv_zen_int8
        (
          n0,
          x0, incx0,
          y0, incy0,
          NULL
        );
    }
    else
    {
        /* Reference path via the expert typed API. */
        PASTEMAC2(d,swapv,BLIS_TAPI_EX_SUF)
        (
          n0,
          x0, incx0,
          y0, incy0,
          NULL,
          NULL
        );
    }

    /* Finalization is likewise deferred to library teardown. */
    // bli_finalize_auto();

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
}
/* Instantiate the generic GENTFUNC template for the complex datatypes
   (c and z), which have no AMD-specific kernel dispatch in this file. */
INSERT_GENTFUNC_BLAS_CZ( swap, swapv )
#endif // BLIS_ENABLE_BLAS

File diff suppressed because it is too large Load Diff

1544
frame/compat/bla_trsm_amd.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2017 - 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2017 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
@@ -64,16 +64,7 @@ void bli_sscalv_zen_int10
if ( PASTEMAC(s,eq0)( *alpha ) )
{
float* zero = bli_s0;
#ifdef BLIS_CONFIG_EPYC
bli_ssetv_zen_int
(
BLIS_NO_CONJUGATE,
n,
zero,
x, incx,
cntx
);
#else
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
f
(
@@ -83,7 +74,7 @@ void bli_sscalv_zen_int10
x, incx,
cntx
);
#endif
return;
}
@@ -342,16 +333,7 @@ void bli_dscalv_zen_int10
if ( PASTEMAC(d,eq0)( *alpha ) )
{
double* zero = bli_d0;
#ifdef BLIS_CONFIG_EPYC
bli_dsetv_zen_int
(
BLIS_NO_CONJUGATE,
n,
zero,
x, incx,
cntx
);
#else
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
f
@@ -362,7 +344,7 @@ void bli_dscalv_zen_int10
x, incx,
cntx
);
#endif
return;
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -95,29 +95,6 @@ void bli_caxpyf_zen_int_4
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
for ( i = 0; i < b_n; ++i )
{
scomplex* a1 = a + (0 )*inca + (i )*lda;
scomplex* chi1 = x + (i )*incx;
scomplex* y1 = y + (0 )*incy;
scomplex alpha_chi1;
bli_ccopycjs( conjx, *chi1, alpha_chi1 );
bli_cscals( *alpha, alpha_chi1 );
bli_caxpyv_zen_int5
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
caxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -141,7 +118,6 @@ void bli_caxpyf_zen_int_4
);
}
#endif
return;
}
@@ -357,28 +333,6 @@ void bli_zaxpyf_zen_int_4
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
for ( i = 0; i < b_n; ++i )
{
dcomplex* a1 = a + (0 )*inca + (i )*lda;
dcomplex* chi1 = x + (i )*incx;
dcomplex* y1 = y + (0 )*incy;
dcomplex alpha_chi1;
bli_zcopycjs( conjx, *chi1, alpha_chi1 );
bli_zscals( *alpha, alpha_chi1 );
bli_zaxpyv_zen_int5
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
zaxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DCOMPLEX, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -402,7 +356,6 @@ void bli_zaxpyf_zen_int_4
);
}
#endif
return;
}

View File

@@ -108,29 +108,6 @@ void bli_saxpyf_zen_int_5
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
for ( i = 0; i < b_n; ++i )
{
float* a1 = a + (0 )*inca + (i )*lda;
float* chi1 = x + (i )*incx;
float* y1 = y + (0 )*incy;
float alpha_chi1;
bli_scopycjs( conjx, *chi1, alpha_chi1 );
bli_sscals( *alpha, alpha_chi1 );
bli_saxpyv_zen_int10
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -154,7 +131,6 @@ void bli_saxpyf_zen_int_5
);
}
#endif
return;
}
@@ -382,29 +358,6 @@ void bli_daxpyf_zen_int_5
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
for ( i = 0; i < b_n; ++i )
{
double* a1 = a + (0 )*inca + (i )*lda;
double* chi1 = x + (i )*incx;
double* y1 = y + (0 )*incy;
double alpha_chi1;
bli_dcopycjs( conjx, *chi1, alpha_chi1 );
bli_dscals( *alpha, alpha_chi1 );
bli_daxpyv_zen_int10
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -428,7 +381,6 @@ void bli_daxpyf_zen_int_5
);
}
#endif
return;
}
@@ -655,29 +607,6 @@ static void bli_daxpyf_zen_int_16x2
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
for ( i = 0; i < b_n; ++i )
{
double* a1 = a + (0 )*inca + (i )*lda;
double* chi1 = x + (i )*incx;
double* y1 = y + (0 )*incy;
double alpha_chi1;
bli_dcopycjs( conjx, *chi1, alpha_chi1 );
bli_dscals( *alpha, alpha_chi1 );
bli_daxpyv_zen_int10
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -701,7 +630,6 @@ static void bli_daxpyf_zen_int_16x2
);
}
#endif
return;
}
@@ -966,43 +894,21 @@ void bli_daxpyf_zen_int_16x4
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
if(b_n & 2)
{
bli_daxpyf_zen_int_16x2( conja,
conjx,
m, 2,
alpha, a, inca, lda,
x, incx,
y, incy,
cntx
);
b_n -= 2;
a += 2*lda;
x += 2 * incx;
}
for ( i = 0; i < b_n; ++i )
{
double* a1 = a + (0 )*inca + (i )*lda;
double* chi1 = x + (i )*incx;
double* y1 = y + (0 )*incy;
double alpha_chi1;
if (b_n & 2)
{
bli_daxpyf_zen_int_16x2( conja,
conjx,
m, 2,
alpha, a, inca, lda,
x, incx,
y, incy,
cntx
);
b_n -= 2;
a += 2*lda;
x += 2 * incx;
}
bli_dcopycjs( conjx, *chi1, alpha_chi1 );
bli_dscals( *alpha, alpha_chi1 );
bli_daxpyv_zen_int10
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -1026,7 +932,6 @@ void bli_daxpyf_zen_int_16x4
);
}
#endif
return;
}
@@ -1396,29 +1301,6 @@ void bli_caxpyf_zen_int_5
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
for ( i = 0; i < b_n; ++i )
{
scomplex* a1 = a + (0 )*inca + (i )*lda;
scomplex* chi1 = x + (i )*incx;
scomplex* y1 = y + (0 )*incy;
scomplex alpha_chi1;
bli_ccopycjs( conjx, *chi1, alpha_chi1 );
bli_cscals( *alpha, alpha_chi1 );
bli_caxpyv_zen_int5
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
caxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -1442,7 +1324,6 @@ void bli_caxpyf_zen_int_5
);
}
#endif
return;
}
@@ -1810,29 +1691,6 @@ void bli_zaxpyf_zen_int_5
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
for ( i = 0; i < b_n; ++i )
{
dcomplex* a1 = a + (0 )*inca + (i )*lda;
dcomplex* chi1 = x + (i )*incx;
dcomplex* y1 = y + (0 )*incy;
dcomplex alpha_chi1;
bli_zcopycjs( conjx, *chi1, alpha_chi1 );
bli_zscals( *alpha, alpha_chi1 );
bli_zaxpyv_zen_int5
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
zaxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DCOMPLEX, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -1855,8 +1713,7 @@ void bli_zaxpyf_zen_int_5
cntx
);
}
#endif
return;
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -97,28 +97,6 @@ void bli_saxpyf_zen_int_6
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
for ( i = 0; i < b_n; ++i )
{
float* a1 = a + (0 )*inca + (i )*lda;
float* chi1 = x + (i )*incx;
float* y1 = y + (0 )*incy;
float alpha_chi1;
bli_scopycjs( conjx, *chi1, alpha_chi1 );
bli_sscals( *alpha, alpha_chi1 );
bli_saxpyv_zen_int10
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -141,7 +119,7 @@ void bli_saxpyf_zen_int_6
cntx
);
}
#endif
return;
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2017-2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -114,16 +114,9 @@ err_t bli_gemm_small
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
return BLIS_NOT_YET_IMPLEMENTED;
#else
// When dynamic dispatch is enabled i.e. library is built for 'amdzen' configuration.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (0 == bamdzen)
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
return BLIS_NOT_YET_IMPLEMENTED;
}