mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
Removed Arch specific code from BLIS framework.
- Removed BLIS_CONFIG_EPYC macro
- The code dependent on this macro is handled in
one of the three ways
-- It is updated to work across platforms.
-- Added in architecture/feature specific runtime checks.
-- Duplicated in AMD specific files. Build system is updated to
pick AMD specific files when library is built for any of the
zen architecture
AMD-Internal: [CPUPL-1960]
Change-Id: I6f9f8018e41fa48eb43ae4245c9c2c361857f43b
This commit is contained in:
24
Makefile
24
Makefile
@@ -5,7 +5,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -212,6 +212,27 @@ MK_REFKERN_OBJS := $(foreach arch, $(CONFIG_LIST), \
|
||||
# Generate object file paths for all of the portable framework source code.
|
||||
MK_FRAME_OBJS := $(call gen-obj-paths-from-src,$(FRAME_SRC_SUFS),$(MK_FRAME_SRC),$(FRAME_PATH),$(BASE_OBJ_FRAME_PATH))
|
||||
|
||||
# AMD has optimized some of the framework files, these optimizations
|
||||
# may not be compatible with other platforms.
|
||||
#
|
||||
# In order to keep main framework code independent of AMD changes,
|
||||
# AMD has duplicated the files and updated them for example
|
||||
# frame/compact/bla_gemm.c : generic framework file
|
||||
# frame/compact/bla_gemm_amd.c : AMD optimized framework file
|
||||
# Based on the archiecture we choose correct files
|
||||
|
||||
ifeq ($(MK_IS_ARCH_ZEN),yes)
|
||||
# Build is being done for AMD platforms, remove the objects which
|
||||
# don't have amd suffix (for which exists AMD specific implementation).
|
||||
MK_FRAME_AMD_OBJS := $(filter $(BASE_OBJ_FRAME_PATH)/%amd.o, $(MK_FRAME_OBJS))
|
||||
FILES_TO_REMOVE := $(subst _amd.o,.o, $(MK_FRAME_AMD_OBJS))
|
||||
MK_FRAME_OBJS := $(filter-out $(FILES_TO_REMOVE), $(MK_FRAME_OBJS))
|
||||
else
|
||||
# Build is done for non AMD platforms, remove the amd specific objects
|
||||
MK_FRAME_AMD_OBJS := $(filter $(BASE_OBJ_FRAME_PATH)/%amd.o, $(MK_FRAME_OBJS))
|
||||
MK_FRAME_OBJS := $(filter-out $(MK_FRAME_AMD_OBJS), $(MK_FRAME_OBJS))
|
||||
endif
|
||||
|
||||
# Generate object file paths for all of the debgu and trace logger.
|
||||
MK_AOCLDTL_OBJS := $(call gen-obj-paths-from-src,$(AOCLDTL_SRC_SUFS),$(MK_AOCLDTL_SRC),$(AOCLDTL_PATH),$(BASE_OBJ_AOCLDTL_PATH))
|
||||
|
||||
@@ -1338,4 +1359,3 @@ else
|
||||
@echo "Uninstalling $(@F) from $(@D)/"
|
||||
@- $(RM_F) $@
|
||||
endif
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -204,5 +204,7 @@ MK_ENABLE_AOCL_DYNAMIC := @enable_aocl_dynamic@
|
||||
# BLAS int size
|
||||
MK_BLAS_INT_TYPE_SIZE := @blas_int_type_size@
|
||||
|
||||
MK_IS_ARCH_ZEN := @enable_aocl_zen@
|
||||
|
||||
# end of ifndef CONFIG_MK_INCLUDED conditional block
|
||||
endif
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -49,16 +49,6 @@ else
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
# This will add BLIS_CONFIG_EPYC for all framework files
|
||||
# FIXME: framework files should not have architecture specific
|
||||
# checks at least at compile time. Once the macro
|
||||
# is defined it is applicable to every build in the
|
||||
# Family including any non AMD configuration.
|
||||
# However, it is still better to define it in makefiles
|
||||
# instead of headers so we can have slighly more
|
||||
# control on this.
|
||||
COPTFLAGS += -DBLIS_CONFIG_EPYC
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
$(eval $(call store-make-defs,$(THIS_CONFIG)))
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -46,25 +46,12 @@ AMD_CONFIG_FILE := amd_config.mk
|
||||
AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
|
||||
-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
|
||||
|
||||
|
||||
# Since we removed BLIS_CONFIG_EPYC from header file, we need to
|
||||
# add it here at two places,
|
||||
# CPPROCFLAGS = This will enable it for framework code
|
||||
# This flag is used when configure is invoked with specific architecture
|
||||
# CKOPTFLAGS = This will enable it for architecture specific kernels
|
||||
# This flag is used for kernels assocaited with this architecture
|
||||
# irrespective of the configuration it is built for.
|
||||
|
||||
CPPROCFLAGS := -DBLIS_CONFIG_EPYC
|
||||
|
||||
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
|
||||
#
|
||||
# --- Enable ETRACE across the library if enabled ETRACE_ENABLE=[0,1] -----------------------
|
||||
#
|
||||
@@ -86,10 +73,6 @@ else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
|
||||
# Add this after updating variables for reference kernels
|
||||
# we don't want this defined for them
|
||||
CKOPTFLAGS += -DBLIS_CONFIG_EPYC
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
$(eval $(call store-make-defs,$(THIS_CONFIG)))
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -50,15 +50,7 @@ THIS_CONFIG := zen2
|
||||
# general-purpose/configuration-agnostic flags in common.mk. You
|
||||
# may specify additional flags here as needed.
|
||||
|
||||
# Since we removed BLIS_CONFIG_EPYC from header file, we need to
|
||||
# add it here at two places,
|
||||
# CPPROCFLAGS = This will enable it for framework code
|
||||
# This flag is used when configure is invoked with specific architecture
|
||||
# CKOPTFLAGS = This will enable it for architecture specific kernels
|
||||
# This flag is used for kernels assocaited with this architecture
|
||||
# irrespective of the configuration it is built for.
|
||||
|
||||
CPPROCFLAGS := -DBLIS_CONFIG_EPYC
|
||||
CPPROCFLAGS :=
|
||||
CMISCFLAGS :=
|
||||
CPICFLAGS :=
|
||||
CWARNFLAGS :=
|
||||
@@ -111,10 +103,6 @@ endif
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
|
||||
# Add this after updating variables for reference kernels
|
||||
# we don't want this defined for them
|
||||
CKOPTFLAGS += -DBLIS_CONFIG_EPYC
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
$(eval $(call store-make-defs,$(THIS_CONFIG)))
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -50,15 +50,7 @@ THIS_CONFIG := zen3
|
||||
# general-purpose/configuration-agnostic flags in common.mk. You
|
||||
# may specify additional flags here as needed.
|
||||
|
||||
# Since we removed BLIS_CONFIG_EPYC from header file, we need to
|
||||
# add it here at two places,
|
||||
# CPPROCFLAGS = This will enable it for framework code
|
||||
# This flag is used when configure is invoked with specific architecture
|
||||
# CKOPTFLAGS = This will enable it for architecture specific kernels
|
||||
# This flag is used for kernels assocaited with this architecture
|
||||
# irrespective of the configuration it is built for.
|
||||
|
||||
CPPROCFLAGS := -DBLIS_CONFIG_EPYC
|
||||
CPPROCFLAGS :=
|
||||
CMISCFLAGS :=
|
||||
CPICFLAGS :=
|
||||
CWARNFLAGS :=
|
||||
@@ -132,10 +124,6 @@ endif # gcc
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
|
||||
# Add this after updating variables for reference kernels
|
||||
# we don't want this defined for them
|
||||
CKOPTFLAGS += -DBLIS_CONFIG_EPYC
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
$(eval $(call store-make-defs,$(THIS_CONFIG)))
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -49,15 +49,7 @@ THIS_CONFIG := zen4
|
||||
# general-purpose/configuration-agnostic flags in common.mk. You
|
||||
# may specify additional flags here as needed.
|
||||
|
||||
# Since we removed BLIS_CONFIG_EPYC from header file, we need to
|
||||
# add it here at two places,
|
||||
# CPPROCFLAGS = This will enable it for framework code
|
||||
# This flag is used when configure is invoked with specific architecture
|
||||
# CKOPTFLAGS = This will enable it for architecture specific kernels
|
||||
# This flag is used for kernels assocaited with this architecture
|
||||
# irrespective of the configuration it is built for.
|
||||
|
||||
CPPROCFLAGS := -DBLIS_CONFIG_EPYC
|
||||
CPPROCFLAGS :=
|
||||
CMISCFLAGS :=
|
||||
CPICFLAGS :=
|
||||
CWARNFLAGS :=
|
||||
@@ -131,10 +123,6 @@ endif # gcc
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
|
||||
# Add this after updating variables for reference kernels
|
||||
# we don't want this defined for them
|
||||
CKOPTFLAGS += -DBLIS_CONFIG_EPYC
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
$(eval $(call store-make-defs,$(THIS_CONFIG)))
|
||||
|
||||
3
configure
vendored
3
configure
vendored
@@ -5,7 +5,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -3370,6 +3370,7 @@ main()
|
||||
| sed -e "s/@enable_aocl_dynamic@/${enable_aocl_dynamic}/g" \
|
||||
| sed -e "s/@complex_return@/${complex_return}/g" \
|
||||
| sed -e "s/@blas_int_type_size@/${blas_int_type_size}/g" \
|
||||
| sed -e "s/\@enable_aocl_zen\@/${enable_aocl_zen}/g" \
|
||||
> "${config_mk_out_path}"
|
||||
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020 - 21, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -104,357 +104,5 @@ void PASTEMAC(ch,varname) \
|
||||
} \
|
||||
}
|
||||
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
void bli_dgemv_unf_var1
|
||||
(
|
||||
trans_t transa,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
double* alpha,
|
||||
double* a, inc_t rs_a, inc_t cs_a,
|
||||
double* x, inc_t incx,
|
||||
double* beta,
|
||||
double* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
double *A1;
|
||||
double *y1;
|
||||
dim_t i;
|
||||
dim_t f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
//memory pool declarations for packing vector X.
|
||||
mem_t mem_bufX;
|
||||
rntm_t rntm;
|
||||
double *x_buf = x;
|
||||
inc_t buf_incx = incx;
|
||||
|
||||
bli_init_once();
|
||||
|
||||
if (cntx == NULL)
|
||||
cntx = bli_gks_query_cntx();
|
||||
|
||||
bli_set_dims_incs_with_trans(transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_iter, &n_elem, &rs_at, &cs_at);
|
||||
|
||||
conja = bli_extract_conj(transa);
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen == 0)
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
const num_t dt = PASTEMAC(d,type);
|
||||
double* x1;
|
||||
double* y1;
|
||||
PASTECH(d,dotxf_ker_ft) kfp_df;
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
|
||||
dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
|
||||
A1 = a + (i )*rs_at + (0 )*cs_at;
|
||||
x1 = x + (0 )*incy;
|
||||
y1 = y + (i )*incy;
|
||||
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */
|
||||
kfp_df
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x1, incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
return;
|
||||
}
|
||||
|
||||
if (incx > 1)
|
||||
{
|
||||
/*
|
||||
Initialize mem pool buffer to NULL and size to 0
|
||||
"buf" and "size" fields are assigned once memory
|
||||
is allocated from the pool in bli_membrk_acquire_m().
|
||||
This will ensure bli_mem_is_alloc() will be passed on
|
||||
an allocated memory if created or a NULL .
|
||||
*/
|
||||
|
||||
mem_bufX.pblk.buf = NULL;
|
||||
mem_bufX.pblk.block_size = 0;
|
||||
mem_bufX.buf_type = 0;
|
||||
mem_bufX.size = 0;
|
||||
mem_bufX.pool = NULL;
|
||||
|
||||
/* In order to get the buffer from pool via rntm access to memory broker
|
||||
is needed.Following are initializations for rntm */
|
||||
|
||||
bli_rntm_init_from_global(&rntm);
|
||||
bli_rntm_set_num_threads_only(1, &rntm);
|
||||
bli_membrk_rntm_set_membrk(&rntm);
|
||||
|
||||
//calculate the size required for n_elem double elements in vector X.
|
||||
size_t buffer_size = n_elem * sizeof(double);
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf("bli_dgemv_unf_var1(): get mem pool block\n");
|
||||
#endif
|
||||
|
||||
/*acquire a Buffer(n_elem*size(double)) from the memory broker
|
||||
and save the associated mem_t entry to mem_bufX.*/
|
||||
bli_membrk_acquire_m(&rntm,
|
||||
buffer_size,
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
&mem_bufX);
|
||||
|
||||
/*Continue packing X if buffer memory is allocated*/
|
||||
if ((bli_mem_is_alloc(&mem_bufX)))
|
||||
{
|
||||
x_buf = bli_mem_buffer(&mem_bufX);
|
||||
|
||||
//pack X vector with non-unit stride to a temp buffer x_buf with unit stride
|
||||
for (dim_t x_index = 0; x_index < n_elem; x_index++)
|
||||
{
|
||||
*(x_buf + x_index) = *(x + (x_index * incx));
|
||||
}
|
||||
// stride of vector x_buf =1
|
||||
buf_incx = 1;
|
||||
}
|
||||
}
|
||||
|
||||
dim_t fuse_factor = 8;
|
||||
dim_t f_temp =0;
|
||||
|
||||
if (n < 4)
|
||||
{
|
||||
fuse_factor = 2;
|
||||
} else if (n < 8)
|
||||
{
|
||||
fuse_factor = 4;
|
||||
}
|
||||
|
||||
|
||||
for (i = 0; i < n_iter; i += f)
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f(i, n_iter, fuse_factor);
|
||||
|
||||
//A = a + i * row_increment + 0 * column_increment
|
||||
A1 = a + (i)*rs_at;
|
||||
y1 = y + (i)*incy;
|
||||
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */
|
||||
switch (f)
|
||||
{
|
||||
case 8:
|
||||
|
||||
bli_ddotxf_zen_int_8(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x_buf, buf_incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx);
|
||||
|
||||
break;
|
||||
default:
|
||||
|
||||
if (f < 4)
|
||||
{
|
||||
bli_ddotxf_zen_int_2(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x_buf, buf_incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_ddotxf_zen_int_4(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x_buf, buf_incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx);
|
||||
}
|
||||
}
|
||||
|
||||
f_temp = bli_determine_blocksize_dim_f(i + f, n_iter, fuse_factor);
|
||||
|
||||
if (f_temp < fuse_factor)
|
||||
{
|
||||
switch (fuse_factor)
|
||||
{
|
||||
case 8:
|
||||
fuse_factor = 4;
|
||||
break;
|
||||
case 4:
|
||||
fuse_factor = 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ((incx > 1) && bli_mem_is_alloc(&mem_bufX))
|
||||
{
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf("bli_dgemv_unf_var1(): releasing mem pool block\n");
|
||||
#endif
|
||||
// Return the buffer to pool
|
||||
bli_membrk_release(&rntm, &mem_bufX);
|
||||
}
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
}
|
||||
|
||||
void bli_sgemv_unf_var1
|
||||
(
|
||||
trans_t transa,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
float* alpha,
|
||||
float* a, inc_t rs_a, inc_t cs_a,
|
||||
float* x, inc_t incx,
|
||||
float* beta,
|
||||
float* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
float* A1;
|
||||
float* x1;
|
||||
float* y1;
|
||||
dim_t i;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
|
||||
bli_init_once();
|
||||
|
||||
if( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_iter, &n_elem, &rs_at, &cs_at );
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen == 0)
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
const num_t dt = PASTEMAC(s,type);
|
||||
float* x1 ;
|
||||
PASTECH(s,dotxf_ker_ft) kfp_df;
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
|
||||
A1 = a + (i )*rs_at + (0 )*cs_at;
|
||||
x1 = x + (0 )*incy;
|
||||
y1 = y + (i )*incy;
|
||||
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */
|
||||
kfp_df
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x1, incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
b_fuse = 8;
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
|
||||
A1 = a + (i )*rs_at + (0 )*cs_at;
|
||||
x1 = x + (0 )*incy;
|
||||
y1 = y + (i )*incy;
|
||||
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */
|
||||
bli_sdotxf_zen_int_8
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x1, incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0_CZ( gemv_unf_var1 )
|
||||
#else
|
||||
INSERT_GENTFUNC_BASIC0( gemv_unf_var1 )
|
||||
#endif
|
||||
|
||||
|
||||
440
frame/2/gemv/bli_gemv_unf_var1_amd.c
Normal file
440
frame/2/gemv/bli_gemv_unf_var1_amd.c
Normal file
@@ -0,0 +1,440 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
trans_t transa, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* x, inc_t incx, \
|
||||
ctype* beta, \
|
||||
ctype* y, inc_t incy, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
\
|
||||
if(cntx == NULL) cntx = bli_gks_query_cntx(); \
|
||||
\
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
ctype* A1; \
|
||||
ctype* x1; \
|
||||
ctype* y1; \
|
||||
dim_t i; \
|
||||
dim_t b_fuse, f; \
|
||||
dim_t n_elem, n_iter; \
|
||||
inc_t rs_at, cs_at; \
|
||||
conj_t conja; \
|
||||
\
|
||||
bli_set_dims_incs_with_trans( transa, \
|
||||
m, n, rs_a, cs_a, \
|
||||
&n_iter, &n_elem, &rs_at, &cs_at ); \
|
||||
\
|
||||
conja = bli_extract_conj( transa ); \
|
||||
\
|
||||
PASTECH(ch,dotxf_ker_ft) kfp_df; \
|
||||
\
|
||||
/* Query the context for the kernel function pointer and fusing factor. */ \
|
||||
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
|
||||
\
|
||||
for ( i = 0; i < n_iter; i += f ) \
|
||||
{ \
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
|
||||
\
|
||||
A1 = a + (i )*rs_at + (0 )*cs_at; \
|
||||
x1 = x + (0 )*incy; \
|
||||
y1 = y + (i )*incy; \
|
||||
\
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */ \
|
||||
kfp_df \
|
||||
( \
|
||||
conja, \
|
||||
conjx, \
|
||||
n_elem, \
|
||||
f, \
|
||||
alpha, \
|
||||
A1, cs_at, rs_at, \
|
||||
x1, incx, \
|
||||
beta, \
|
||||
y1, incy, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
} \
|
||||
}
|
||||
|
||||
void bli_dgemv_unf_var1
|
||||
(
|
||||
trans_t transa,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
double* alpha,
|
||||
double* a, inc_t rs_a, inc_t cs_a,
|
||||
double* x, inc_t incx,
|
||||
double* beta,
|
||||
double* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
double *A1;
|
||||
double *y1;
|
||||
dim_t i;
|
||||
dim_t f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
//memory pool declarations for packing vector X.
|
||||
mem_t mem_bufX;
|
||||
rntm_t rntm;
|
||||
double *x_buf = x;
|
||||
inc_t buf_incx = incx;
|
||||
|
||||
bli_init_once();
|
||||
|
||||
if (cntx == NULL)
|
||||
cntx = bli_gks_query_cntx();
|
||||
|
||||
bli_set_dims_incs_with_trans(transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_iter, &n_elem, &rs_at, &cs_at);
|
||||
|
||||
conja = bli_extract_conj(transa);
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == FALSE)
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
const num_t dt = PASTEMAC(d,type);
|
||||
double* x1;
|
||||
double* y1;
|
||||
PASTECH(d,dotxf_ker_ft) kfp_df;
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
|
||||
dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
|
||||
A1 = a + (i )*rs_at + (0 )*cs_at;
|
||||
x1 = x + (0 )*incy;
|
||||
y1 = y + (i )*incy;
|
||||
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */
|
||||
kfp_df
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x1, incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
return;
|
||||
}
|
||||
if (incx > 1)
|
||||
{
|
||||
/*
|
||||
Initialize mem pool buffer to NULL and size to 0
|
||||
"buf" and "size" fields are assigned once memory
|
||||
is allocated from the pool in bli_membrk_acquire_m().
|
||||
This will ensure bli_mem_is_alloc() will be passed on
|
||||
an allocated memory if created or a NULL .
|
||||
*/
|
||||
|
||||
mem_bufX.pblk.buf = NULL;
|
||||
mem_bufX.pblk.block_size = 0;
|
||||
mem_bufX.buf_type = 0;
|
||||
mem_bufX.size = 0;
|
||||
mem_bufX.pool = NULL;
|
||||
|
||||
/* In order to get the buffer from pool via rntm access to memory broker
|
||||
is needed.Following are initializations for rntm */
|
||||
|
||||
bli_rntm_init_from_global(&rntm);
|
||||
bli_rntm_set_num_threads_only(1, &rntm);
|
||||
bli_membrk_rntm_set_membrk(&rntm);
|
||||
|
||||
//calculate the size required for n_elem double elements in vector X.
|
||||
size_t buffer_size = n_elem * sizeof(double);
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf("bli_dgemv_unf_var1(): get mem pool block\n");
|
||||
#endif
|
||||
|
||||
/*acquire a Buffer(n_elem*size(double)) from the memory broker
|
||||
and save the associated mem_t entry to mem_bufX.*/
|
||||
bli_membrk_acquire_m(&rntm,
|
||||
buffer_size,
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
&mem_bufX);
|
||||
|
||||
/*Continue packing X if buffer memory is allocated*/
|
||||
if ((bli_mem_is_alloc(&mem_bufX)))
|
||||
{
|
||||
x_buf = bli_mem_buffer(&mem_bufX);
|
||||
|
||||
//pack X vector with non-unit stride to a temp buffer x_buf with unit stride
|
||||
for (dim_t x_index = 0; x_index < n_elem; x_index++)
|
||||
{
|
||||
*(x_buf + x_index) = *(x + (x_index * incx));
|
||||
}
|
||||
// stride of vector x_buf =1
|
||||
buf_incx = 1;
|
||||
}
|
||||
}
|
||||
|
||||
dim_t fuse_factor = 8;
|
||||
dim_t f_temp =0;
|
||||
|
||||
if (n < 4)
|
||||
{
|
||||
fuse_factor = 2;
|
||||
} else if (n < 8)
|
||||
{
|
||||
fuse_factor = 4;
|
||||
}
|
||||
|
||||
for (i = 0; i < n_iter; i += f)
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f(i, n_iter, fuse_factor);
|
||||
|
||||
//A = a + i * row_increment + 0 * column_increment
|
||||
A1 = a + (i)*rs_at;
|
||||
y1 = y + (i)*incy;
|
||||
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */
|
||||
switch (f)
|
||||
{
|
||||
case 8:
|
||||
|
||||
bli_ddotxf_zen_int_8(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x_buf, buf_incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx);
|
||||
|
||||
break;
|
||||
default:
|
||||
|
||||
if (f < 4)
|
||||
{
|
||||
bli_ddotxf_zen_int_2(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x_buf, buf_incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_ddotxf_zen_int_4(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x_buf, buf_incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx);
|
||||
}
|
||||
}
|
||||
|
||||
f_temp = bli_determine_blocksize_dim_f(i + f, n_iter, fuse_factor);
|
||||
|
||||
if (f_temp < fuse_factor)
|
||||
{
|
||||
switch (fuse_factor)
|
||||
{
|
||||
case 8:
|
||||
fuse_factor = 4;
|
||||
break;
|
||||
case 4:
|
||||
fuse_factor = 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ((incx > 1) && bli_mem_is_alloc(&mem_bufX))
|
||||
{
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf("bli_dgemv_unf_var1(): releasing mem pool block\n");
|
||||
#endif
|
||||
// Return the buffer to pool
|
||||
bli_membrk_release(&rntm, &mem_bufX);
|
||||
}
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
}
|
||||
|
||||
void bli_sgemv_unf_var1
|
||||
(
|
||||
trans_t transa,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
float* alpha,
|
||||
float* a, inc_t rs_a, inc_t cs_a,
|
||||
float* x, inc_t incx,
|
||||
float* beta,
|
||||
float* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
float* A1;
|
||||
float* x1;
|
||||
float* y1;
|
||||
dim_t i;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
|
||||
bli_init_once();
|
||||
|
||||
if( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_iter, &n_elem, &rs_at, &cs_at );
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == FALSE)
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
const num_t dt = PASTEMAC(s,type);
|
||||
float* x1 ;
|
||||
PASTECH(s,dotxf_ker_ft) kfp_df;
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
|
||||
A1 = a + (i )*rs_at + (0 )*cs_at;
|
||||
x1 = x + (0 )*incy;
|
||||
y1 = y + (i )*incy;
|
||||
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */
|
||||
kfp_df
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x1, incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
b_fuse = 8;
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
|
||||
A1 = a + (i )*rs_at + (0 )*cs_at;
|
||||
x1 = x + (0 )*incy;
|
||||
y1 = y + (i )*incy;
|
||||
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */
|
||||
bli_sdotxf_zen_int_8
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x1, incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0_CZ( gemv_unf_var1 )
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020-21, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -137,764 +137,4 @@ void PASTEMAC(ch,varname) \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); \
|
||||
}
|
||||
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
|
||||
void bli_dgemv_unf_var2
|
||||
(
|
||||
trans_t transa,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
double* alpha,
|
||||
double* a, inc_t rs_a, inc_t cs_a,
|
||||
double* x, inc_t incx,
|
||||
double* beta,
|
||||
double* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
|
||||
double* A1;
|
||||
double* x1;
|
||||
dim_t i;
|
||||
dim_t f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
//memory pool declarations for packing vector Y.
|
||||
mem_t mem_bufY;
|
||||
rntm_t rntm;
|
||||
double *y_buf = y;
|
||||
inc_t buf_incy = incy;
|
||||
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_elem, &n_iter, &rs_at, &cs_at );
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen == 0)
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
const num_t dt = PASTEMAC(d,type);
|
||||
double* x1;
|
||||
double* y1;
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
if ( PASTEMAC(d,eq0)( *beta ) )
|
||||
{
|
||||
double* zero = PASTEMAC(d,0);
|
||||
/* y = 0; */
|
||||
PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
zero,
|
||||
y, incy,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* y = beta * y; */
|
||||
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y, incy,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
PASTECH(d,axpyf_ker_ft) kfp_af;
|
||||
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
|
||||
dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
|
||||
A1 = a + (0 )*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
y1 = y + (0 )*incy;
|
||||
|
||||
/* y = y + alpha * A1 * x1; */
|
||||
kfp_af
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, rs_at, cs_at,
|
||||
x1, incx,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
return;
|
||||
}
|
||||
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
/* y = beta * y; */
|
||||
/* beta=0 case is hadled by scalv internally */
|
||||
|
||||
bli_dscalv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y, incy,
|
||||
NULL
|
||||
);
|
||||
|
||||
if( bli_deq0( *alpha ) )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
|
||||
return;
|
||||
}
|
||||
|
||||
if (incy > 1)
|
||||
{
|
||||
/*
|
||||
Initialize mem pool buffer to NULL and size to 0
|
||||
"buf" and "size" fields are assigned once memory
|
||||
is allocated from the pool in bli_membrk_acquire_m().
|
||||
This will ensure bli_mem_is_alloc() will be passed on
|
||||
an allocated memory if created or a NULL .
|
||||
*/
|
||||
mem_bufY.pblk.buf = NULL; mem_bufY.pblk.block_size = 0;
|
||||
mem_bufY.buf_type = 0; mem_bufY.size = 0;
|
||||
mem_bufY.pool = NULL;
|
||||
|
||||
/* In order to get the buffer from pool via rntm access to memory broker
|
||||
is needed.Following are initializations for rntm */
|
||||
|
||||
bli_rntm_init_from_global( &rntm );
|
||||
bli_rntm_set_num_threads_only( 1, &rntm );
|
||||
bli_membrk_rntm_set_membrk( &rntm );
|
||||
|
||||
//calculate the size required for n_elem double elements in vector Y.
|
||||
size_t buffer_size = n_elem * sizeof(double);
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_dgemv_unf_var2(): get mem pool block\n" );
|
||||
#endif
|
||||
|
||||
/*acquire a Buffer(n_elem*size(double)) from the memory broker
|
||||
and save the associated mem_t entry to mem_bufY.*/
|
||||
bli_membrk_acquire_m(&rntm,
|
||||
buffer_size,
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
&mem_bufY);
|
||||
|
||||
/*Continue packing Y if buffer memory is allocated*/
|
||||
if ((bli_mem_is_alloc( &mem_bufY )))
|
||||
{
|
||||
y_buf = bli_mem_buffer(&mem_bufY);
|
||||
|
||||
//pack Y vector with non-unit stride to a temp buffer y_buf with unit stride
|
||||
for(dim_t y_index = 0 ; y_index < n_elem ; y_index++)
|
||||
{
|
||||
*(y_buf + y_index) = *(y + (y_index * incy)) ;
|
||||
}
|
||||
// stride of vector y_buf =1
|
||||
buf_incy = 1;
|
||||
}
|
||||
}
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, BLIS_DGEMV_VAR2_FUSE );
|
||||
|
||||
A1 = a + (0 )*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
|
||||
/* y = y + alpha * A1 * x1; */
|
||||
bli_daxpyf_zen_int_16x4
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, rs_at, cs_at,
|
||||
x1, incx,
|
||||
y_buf, buf_incy,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
if ((incy > 1) && bli_mem_is_alloc( &mem_bufY ))
|
||||
{
|
||||
//store the result from unit strided y_buf to non-unit strided Y
|
||||
for(dim_t y_index = 0 ; y_index < n_elem ; y_index++)
|
||||
{
|
||||
*(y + (y_index * incy)) = *(y_buf + y_index) ;
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_dgemv_unf_var2(): releasing mem pool block\n" );
|
||||
#endif
|
||||
// Return the buffer to pool
|
||||
bli_membrk_release(&rntm , &mem_bufY);
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
}
|
||||
|
||||
void bli_sgemv_unf_var2
|
||||
(
|
||||
trans_t transa,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
float* alpha,
|
||||
float* a, inc_t rs_a, inc_t cs_a,
|
||||
float* x, inc_t incx,
|
||||
float* beta,
|
||||
float* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
|
||||
float* A1;
|
||||
float* x1;
|
||||
float* y1;
|
||||
dim_t i;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_elem, &n_iter, &rs_at, &cs_at );
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen == 0)
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
const num_t dt = PASTEMAC(s,type);
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
if ( PASTEMAC(s,eq0)( *beta ) )
|
||||
{
|
||||
float* zero = PASTEMAC(s,0);
|
||||
/* y = 0; */
|
||||
PASTEMAC2(s,setv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
zero,
|
||||
y, incy,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* y = beta * y; */
|
||||
PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y, incy,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
PASTECH(s,axpyf_ker_ft) kfp_af;
|
||||
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
|
||||
A1 = a + (0 )*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
y1 = y + (0 )*incy;
|
||||
|
||||
/* y = y + alpha * A1 * x1; */
|
||||
kfp_af
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, rs_at, cs_at,
|
||||
x1, incx,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
return;
|
||||
}
|
||||
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
/* y = beta * y; */
|
||||
/* beta=0 case is hadled by scalv internally */
|
||||
|
||||
bli_sscalv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y, incy,
|
||||
NULL
|
||||
);
|
||||
|
||||
if( bli_seq0( *alpha ) )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
|
||||
return;
|
||||
}
|
||||
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
b_fuse = 6;
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
|
||||
A1 = a + (0 )*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
y1 = y + (0 )*incy;
|
||||
|
||||
/* y = y + alpha * A1 * x1; */
|
||||
bli_saxpyf_zen_int_6
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, rs_at, cs_at,
|
||||
x1, incx,
|
||||
y1, incy,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
}
|
||||
|
||||
|
||||
void bli_zgemv_unf_var2
|
||||
(
|
||||
trans_t transa,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dcomplex* alpha,
|
||||
dcomplex* a, inc_t rs_a, inc_t cs_a,
|
||||
dcomplex* x, inc_t incx,
|
||||
dcomplex* beta,
|
||||
dcomplex* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
|
||||
dcomplex* A1;
|
||||
dcomplex* x1;
|
||||
dcomplex* y1;
|
||||
dim_t i;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_elem, &n_iter, &rs_at, &cs_at );
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
/* y = beta * y; */
|
||||
|
||||
/* beta=0 case is hadled by scalv internally */
|
||||
/* bli_zscalv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y,
|
||||
incy,
|
||||
cntx
|
||||
);*/
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen == 0)
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
const num_t dt = PASTEMAC(z,type);
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
if ( PASTEMAC(z,eq0)( *beta ) )
|
||||
{
|
||||
dcomplex* zero = PASTEMAC(z,0);
|
||||
/* y = 0; */
|
||||
PASTEMAC2(z,setv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
zero,
|
||||
y, incy,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* y = beta * y; */
|
||||
PASTEMAC2(z,scalv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y, incy,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
PASTECH(z,axpyf_ker_ft) kfp_af;
|
||||
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
|
||||
A1 = a + (0 )*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
y1 = y + (0 )*incy;
|
||||
|
||||
/* y = y + alpha * A1 * x1; */
|
||||
kfp_af
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, rs_at, cs_at,
|
||||
x1, incx,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
return;
|
||||
}
|
||||
|
||||
bli_zscalv_ex
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y, incy,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
if( bli_zeq0( *alpha ) )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
return;
|
||||
}
|
||||
|
||||
// for non-unit incx, incy and rs_at and conjugate will be added in the next patch
|
||||
if( (incx == 1 && incy == 1 && rs_at == 1 ) &&
|
||||
!bli_is_conj(conja) && !bli_is_conj(conjx) && !bli_is_trans(transa))
|
||||
{
|
||||
// This gemv code deals with the followint conditions only
|
||||
// 1. incx, incy, and row stride equal to one
|
||||
// 2. Non conjugate A matrix and X vector
|
||||
// 3. No Transpose for A Martix
|
||||
// Rest is taken care by the else part (axpyf implementation)
|
||||
bli_zgemv_zen_int_4x4
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
m,
|
||||
n,
|
||||
alpha,
|
||||
a, rs_at, cs_at,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* fusing factor */
|
||||
b_fuse = 4;
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
A1 = a + (0 )*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
y1 = y + (0 )*incy;
|
||||
|
||||
/* y = y + alpha * A1 * x1; */
|
||||
bli_zaxpyf_zen_int_4
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, rs_at, cs_at,
|
||||
x1, incx,
|
||||
y1, incy,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
}
|
||||
|
||||
void bli_cgemv_unf_var2
|
||||
(
|
||||
trans_t transa,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
scomplex* alpha,
|
||||
scomplex* a, inc_t rs_a, inc_t cs_a,
|
||||
scomplex* x, inc_t incx,
|
||||
scomplex* beta,
|
||||
scomplex* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
|
||||
scomplex* A1;
|
||||
scomplex* x1;
|
||||
scomplex* y1;
|
||||
dim_t i;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_elem, &n_iter, &rs_at, &cs_at );
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
/* y = beta * y; */
|
||||
/* beta=0 case is hadled by scalv internally */
|
||||
/*bli_cscalv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y,
|
||||
incy,
|
||||
cntx
|
||||
);*/
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen == 0)
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
const num_t dt = PASTEMAC(c,type);
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
if ( PASTEMAC(c,eq0)( *beta ) )
|
||||
{
|
||||
scomplex* zero = PASTEMAC(c,0);
|
||||
/* y = 0; */
|
||||
PASTEMAC2(c,setv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
zero,
|
||||
y, incy,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* y = beta * y; */
|
||||
PASTEMAC2(c,scalv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y, incy,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
PASTECH(c,axpyf_ker_ft) kfp_af;
|
||||
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
|
||||
A1 = a + (0 )*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
y1 = y + (0 )*incy;
|
||||
|
||||
/* y = y + alpha * A1 * x1; */
|
||||
kfp_af
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, rs_at, cs_at,
|
||||
x1, incx,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
return;
|
||||
}
|
||||
|
||||
bli_cscalv_ex
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y, incy,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
|
||||
|
||||
if( bli_ceq0( *alpha ) )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
|
||||
return;
|
||||
}
|
||||
|
||||
// for non-unit incx, incy and rs_at and conjugate will be added in the next patch
|
||||
if( ( (incx == 1) && (incy == 1) && (rs_at == 1) ) &&
|
||||
!bli_is_conj(conja) && !bli_is_conj(conjx) &&
|
||||
!bli_is_trans(transa))
|
||||
{
|
||||
// This gemv code deals with the followint conditions only
|
||||
// 1. incx, incy, and row stride equal to one
|
||||
// 2. Non conjugate A matrix and X vector
|
||||
// 3. No Transpose for A Martix
|
||||
// Rest is taken care by the else part (axpyf implementation)
|
||||
bli_cgemv_zen_int_4x4
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
m,
|
||||
n,
|
||||
alpha,
|
||||
a, rs_at, cs_at,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* fusing factor. */
|
||||
b_fuse = 4;
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
A1 = a + (0 )*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
y1 = y + (0 )*incy;
|
||||
|
||||
/* y = y + alpha * A1 * x1; */
|
||||
bli_caxpyf_zen_int_4
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, rs_at, cs_at,
|
||||
x1, incx,
|
||||
y1, incy,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
}
|
||||
|
||||
|
||||
#else
|
||||
INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
|
||||
#endif
|
||||
INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
|
||||
879
frame/2/gemv/bli_gemv_unf_var2_amd.c
Normal file
879
frame/2/gemv/bli_gemv_unf_var2_amd.c
Normal file
@@ -0,0 +1,879 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#define BLIS_DGEMV_VAR2_FUSE 4
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
trans_t transa, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* x, inc_t incx, \
|
||||
ctype* beta, \
|
||||
ctype* y, inc_t incy, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
\
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); \
|
||||
\
|
||||
bli_init_once(); \
|
||||
\
|
||||
if(cntx == NULL) cntx = bli_gks_query_cntx(); \
|
||||
\
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
ctype* zero = PASTEMAC(ch,0); \
|
||||
ctype* A1; \
|
||||
ctype* x1; \
|
||||
ctype* y1; \
|
||||
dim_t i; \
|
||||
dim_t b_fuse, f; \
|
||||
dim_t n_elem, n_iter; \
|
||||
inc_t rs_at, cs_at; \
|
||||
conj_t conja; \
|
||||
\
|
||||
bli_set_dims_incs_with_trans( transa, \
|
||||
m, n, rs_a, cs_a, \
|
||||
&n_elem, &n_iter, &rs_at, &cs_at ); \
|
||||
\
|
||||
conja = bli_extract_conj( transa ); \
|
||||
\
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */ \
|
||||
if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
/* y = 0; */ \
|
||||
PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n_elem, \
|
||||
zero, \
|
||||
y, incy, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* y = beta * y; */ \
|
||||
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n_elem, \
|
||||
beta, \
|
||||
y, incy, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
PASTECH(ch,axpyf_ker_ft) kfp_af; \
|
||||
\
|
||||
/* Query the context for the kernel function pointer and fusing factor. */ \
|
||||
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
|
||||
\
|
||||
for ( i = 0; i < n_iter; i += f ) \
|
||||
{ \
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
|
||||
\
|
||||
A1 = a + (0 )*rs_at + (i )*cs_at; \
|
||||
x1 = x + (i )*incx; \
|
||||
y1 = y + (0 )*incy; \
|
||||
\
|
||||
/* y = y + alpha * A1 * x1; */ \
|
||||
kfp_af \
|
||||
( \
|
||||
conja, \
|
||||
conjx, \
|
||||
n_elem, \
|
||||
f, \
|
||||
alpha, \
|
||||
A1, rs_at, cs_at, \
|
||||
x1, incx, \
|
||||
y1, incy, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); \
|
||||
}
|
||||
|
||||
void bli_dgemv_unf_var2
|
||||
(
|
||||
trans_t transa,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
double* alpha,
|
||||
double* a, inc_t rs_a, inc_t cs_a,
|
||||
double* x, inc_t incx,
|
||||
double* beta,
|
||||
double* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
|
||||
double* A1;
|
||||
double* x1;
|
||||
dim_t i;
|
||||
dim_t f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
//memory pool declarations for packing vector Y.
|
||||
mem_t mem_bufY;
|
||||
rntm_t rntm;
|
||||
double *y_buf = y;
|
||||
inc_t buf_incy = incy;
|
||||
|
||||
// For AMD these APIS are invoked skipping intermediate framework layers
|
||||
// Hence we need to ensure that cntx is set here.
|
||||
bli_init_once();
|
||||
if(cntx == NULL) cntx = bli_gks_query_cntx();
|
||||
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_elem, &n_iter, &rs_at, &cs_at );
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == FALSE)
|
||||
{
|
||||
const num_t dt = PASTEMAC(d,type);
|
||||
double* x1;
|
||||
double* y1;
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
if ( PASTEMAC(d,eq0)( *beta ) )
|
||||
{
|
||||
double* zero = PASTEMAC(d,0);
|
||||
/* y = 0; */
|
||||
PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
zero,
|
||||
y, incy,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* y = beta * y; */
|
||||
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y, incy,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
PASTECH(d,axpyf_ker_ft) kfp_af;
|
||||
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
|
||||
dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
|
||||
A1 = a + (0 )*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
y1 = y + (0 )*incy;
|
||||
|
||||
/* y = y + alpha * A1 * x1; */
|
||||
kfp_af
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, rs_at, cs_at,
|
||||
x1, incx,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
return;
|
||||
}
|
||||
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
/* y = beta * y; */
|
||||
/* beta=0 case is hadled by scalv internally */
|
||||
|
||||
bli_dscalv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y, incy,
|
||||
cntx
|
||||
);
|
||||
|
||||
if( bli_deq0( *alpha ) )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
|
||||
return;
|
||||
}
|
||||
|
||||
if (incy > 1)
|
||||
{
|
||||
/*
|
||||
Initialize mem pool buffer to NULL and size to 0
|
||||
"buf" and "size" fields are assigned once memory
|
||||
is allocated from the pool in bli_membrk_acquire_m().
|
||||
This will ensure bli_mem_is_alloc() will be passed on
|
||||
an allocated memory if created or a NULL .
|
||||
*/
|
||||
mem_bufY.pblk.buf = NULL; mem_bufY.pblk.block_size = 0;
|
||||
mem_bufY.buf_type = 0; mem_bufY.size = 0;
|
||||
mem_bufY.pool = NULL;
|
||||
|
||||
/* In order to get the buffer from pool via rntm access to memory broker
|
||||
is needed.Following are initializations for rntm */
|
||||
|
||||
bli_rntm_init_from_global( &rntm );
|
||||
bli_rntm_set_num_threads_only( 1, &rntm );
|
||||
bli_membrk_rntm_set_membrk( &rntm );
|
||||
|
||||
//calculate the size required for n_elem double elements in vector Y.
|
||||
size_t buffer_size = n_elem * sizeof(double);
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_dgemv_unf_var2(): get mem pool block\n" );
|
||||
#endif
|
||||
|
||||
/*acquire a Buffer(n_elem*size(double)) from the memory broker
|
||||
and save the associated mem_t entry to mem_bufY.*/
|
||||
bli_membrk_acquire_m(&rntm,
|
||||
buffer_size,
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
&mem_bufY);
|
||||
|
||||
/*Continue packing Y if buffer memory is allocated*/
|
||||
if ((bli_mem_is_alloc( &mem_bufY )))
|
||||
{
|
||||
y_buf = bli_mem_buffer(&mem_bufY);
|
||||
|
||||
//pack Y vector with non-unit stride to a temp buffer y_buf with unit stride
|
||||
for(dim_t y_index = 0 ; y_index < n_elem ; y_index++)
|
||||
{
|
||||
*(y_buf + y_index) = *(y + (y_index * incy)) ;
|
||||
}
|
||||
// stride of vector y_buf =1
|
||||
buf_incy = 1;
|
||||
}
|
||||
}
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, BLIS_DGEMV_VAR2_FUSE );
|
||||
|
||||
A1 = a + (0 )*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
|
||||
/* y = y + alpha * A1 * x1; */
|
||||
bli_daxpyf_zen_int_16x4
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, rs_at, cs_at,
|
||||
x1, incx,
|
||||
y_buf, buf_incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
if ((incy > 1) && bli_mem_is_alloc( &mem_bufY ))
|
||||
{
|
||||
//store the result from unit strided y_buf to non-unit strided Y
|
||||
for(dim_t y_index = 0 ; y_index < n_elem ; y_index++)
|
||||
{
|
||||
*(y + (y_index * incy)) = *(y_buf + y_index) ;
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_dgemv_unf_var2(): releasing mem pool block\n" );
|
||||
#endif
|
||||
// Return the buffer to pool
|
||||
bli_membrk_release(&rntm , &mem_bufY);
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
}
|
||||
|
||||
// Single-precision GEMV, unblocked variant 2 (axpyf-based):
//   y := beta * y + alpha * op(A) * x
// computed as a sequence of fused axpyf sub-problems over groups of
// columns of op(A).
//
// AMD fast path: on AVX-capable CPUs the hand-written zen scalv/axpyf
// kernels are called directly; on all other architectures (including
// 'generic') the kernels registered in the context are used instead.
void bli_sgemv_unf_var2
     (
       trans_t transa,
       conj_t  conjx,
       dim_t   m,
       dim_t   n,
       float*  alpha,
       float*  a, inc_t rs_a, inc_t cs_a,
       float*  x, inc_t incx,
       float*  beta,
       float*  y, inc_t incy,
       cntx_t* cntx
     )
{

    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
    float*  A1;
    float*  x1;
    float*  y1;
    dim_t   i;
    dim_t   b_fuse, f;
    dim_t   n_elem, n_iter;
    inc_t   rs_at, cs_at;
    conj_t  conja;

    // For AMD these APIs are invoked skipping intermediate framework layers.
    // Hence we need to ensure that cntx is set here.
    bli_init_once();
    if(cntx == NULL) cntx = bli_gks_query_cntx();

    // Fold the transpose into effective dimensions and strides:
    // op(A) is n_elem x n_iter with strides rs_at/cs_at.
    bli_set_dims_incs_with_trans( transa,
                                  m, n, rs_a, cs_a,
                                  &n_elem, &n_iter, &rs_at, &cs_at );

    conja = bli_extract_conj( transa );

    // This function is invoked on all architectures including 'generic'.
    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == FALSE)
    {
        const num_t dt = PASTEMAC(s,type);
        /* If beta is zero, use setv. Otherwise, scale by beta. */
        if ( PASTEMAC(s,eq0)( *beta ) )
        {
            float* zero = PASTEMAC(s,0);
            /* y = 0; */
            PASTEMAC2(s,setv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              zero,
              y, incy,
              cntx,
              NULL
            );
        }
        else
        {
            /* y = beta * y; */
            PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              beta,
              y, incy,
              cntx,
              NULL
            );
        }

        PASTECH(s,axpyf_ker_ft) kfp_af;

        /* Query the context for the kernel function pointer and fusing factor. */
        kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
        b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );

        for ( i = 0; i < n_iter; i += f )
        {
            // f = number of columns fused in this iteration (<= b_fuse).
            f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );

            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            kfp_af
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              cntx
            );
        }
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    /* If beta is zero, use setv. Otherwise, scale by beta. */
    /* y = beta * y; */
    /* beta=0 case is handled by scalv internally */
    bli_sscalv_zen_int10
    (
      BLIS_NO_CONJUGATE,
      n_elem,
      beta,
      y, incy,
      cntx
    );

    // alpha == 0: y has already been scaled by beta above; nothing to add.
    if( bli_seq0( *alpha ) )
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
        return;
    }

    /* Fusing factor of the hand-written zen axpyf kernel used below. */
    b_fuse = 6;

    for ( i = 0; i < n_iter; i += f )
    {
        f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );

        A1 = a + (0 )*rs_at + (i )*cs_at;
        x1 = x + (i )*incx;
        y1 = y + (0 )*incy;

        /* y = y + alpha * A1 * x1; */
        bli_saxpyf_zen_int_6
        (
          conja,
          conjx,
          n_elem,
          f,
          alpha,
          A1, rs_at, cs_at,
          x1, incx,
          y1, incy,
          cntx
        );
    }
    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
|
||||
|
||||
|
||||
// Double-complex GEMV, unblocked variant 2:
//   y := beta * y + alpha * op(A) * x
//
// AMD fast path: on AVX-capable CPUs, either a dedicated 4x4 gemv kernel
// (unit strides, no transpose, no conjugation) or the zen axpyf kernel is
// used. Non-AVX platforms fall back to the context-derived kernels.
void bli_zgemv_unf_var2
     (
       trans_t transa,
       conj_t  conjx,
       dim_t   m,
       dim_t   n,
       dcomplex*  alpha,
       dcomplex*  a, inc_t rs_a, inc_t cs_a,
       dcomplex*  x, inc_t incx,
       dcomplex*  beta,
       dcomplex*  y, inc_t incy,
       cntx_t* cntx
     )
{

    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
    dcomplex*  A1;
    dcomplex*  x1;
    dcomplex*  y1;
    dim_t   i;
    dim_t   b_fuse, f;
    dim_t   n_elem, n_iter;
    inc_t   rs_at, cs_at;
    conj_t  conja;

    // For AMD these APIs are invoked skipping intermediate framework layers.
    // Hence we need to ensure that cntx is set here.
    bli_init_once();
    if(cntx == NULL) cntx = bli_gks_query_cntx();

    // Fold the transpose into effective dimensions and strides:
    // op(A) is n_elem x n_iter with strides rs_at/cs_at.
    bli_set_dims_incs_with_trans( transa,
                                  m, n, rs_a, cs_a,
                                  &n_elem, &n_iter, &rs_at, &cs_at );

    conja = bli_extract_conj( transa );

    /* If beta is zero, use setv. Otherwise, scale by beta. */
    /* y = beta * y; */
    /* The beta=0 case is handled by scalv internally, so no separate
       setv call is needed on the AVX path below. */

    // This function is invoked on all architectures including 'generic'.
    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == FALSE)
    {
        const num_t dt = PASTEMAC(z,type);
        /* If beta is zero, use setv. Otherwise, scale by beta. */
        if ( PASTEMAC(z,eq0)( *beta ) )
        {
            dcomplex* zero = PASTEMAC(z,0);
            /* y = 0; */
            PASTEMAC2(z,setv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              zero,
              y, incy,
              cntx,
              NULL
            );
        }
        else
        {
            /* y = beta * y; */
            PASTEMAC2(z,scalv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              beta,
              y, incy,
              cntx,
              NULL
            );
        }

        PASTECH(z,axpyf_ker_ft) kfp_af;

        /* Query the context for the kernel function pointer and fusing factor. */
        kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
        b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );

        for ( i = 0; i < n_iter; i += f )
        {
            f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );

            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            kfp_af
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              cntx
            );
        }
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    /* y = beta * y; (beta == 0 is handled inside scalv) */
    bli_zscalv_ex
    (
      BLIS_NO_CONJUGATE,
      n_elem,
      beta,
      y, incy,
      cntx,
      NULL
    );

    // alpha == 0: y has already been scaled by beta above; nothing to add.
    if( bli_zeq0( *alpha ) )
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    // for non-unit incx, incy and rs_at and conjugate will be added in the next patch
    if( (incx == 1 && incy == 1 && rs_at == 1 ) &&
        !bli_is_conj(conja) && !bli_is_conj(conjx) && !bli_is_trans(transa))
    {
        // This gemv code deals with the following conditions only
        // 1. incx, incy, and row stride equal to one
        // 2. Non-conjugate A matrix and X vector
        // 3. No transpose for A matrix
        // Rest is taken care of by the else part (axpyf implementation)
        // NOTE(review): beta is forwarded to the kernel even though y was
        // already scaled by bli_zscalv_ex above — confirm the kernel does
        // not apply beta a second time.
        bli_zgemv_zen_int_4x4
        (
          conja,
          conjx,
          m,
          n,
          alpha,
          a, rs_at, cs_at,
          x, incx,
          beta,
          y, incy,
          cntx
        );
    }
    else
    {
        /* fusing factor */
        b_fuse = 4;

        for ( i = 0; i < n_iter; i += f )
        {
            f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            bli_zaxpyf_zen_int_4
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              cntx
            );
        }
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
|
||||
|
||||
// Single-complex GEMV, unblocked variant 2:
//   y := beta * y + alpha * op(A) * x
//
// Mirrors bli_zgemv_unf_var2: AVX-capable CPUs use either a dedicated 4x4
// gemv kernel (unit strides, no transpose, no conjugation) or the zen
// axpyf kernel; other platforms use the context-derived kernels.
void bli_cgemv_unf_var2
     (
       trans_t transa,
       conj_t  conjx,
       dim_t   m,
       dim_t   n,
       scomplex*  alpha,
       scomplex*  a, inc_t rs_a, inc_t cs_a,
       scomplex*  x, inc_t incx,
       scomplex*  beta,
       scomplex*  y, inc_t incy,
       cntx_t* cntx
     )
{

    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
    scomplex*  A1;
    scomplex*  x1;
    scomplex*  y1;
    dim_t   i;
    dim_t   b_fuse, f;
    dim_t   n_elem, n_iter;
    inc_t   rs_at, cs_at;
    conj_t  conja;

    // For AMD these APIs are invoked skipping intermediate framework layers.
    // Hence we need to ensure that cntx is set here.
    bli_init_once();
    if(cntx == NULL) cntx = bli_gks_query_cntx();

    // Fold the transpose into effective dimensions and strides:
    // op(A) is n_elem x n_iter with strides rs_at/cs_at.
    bli_set_dims_incs_with_trans( transa,
                                  m, n, rs_a, cs_a,
                                  &n_elem, &n_iter, &rs_at, &cs_at );

    conja = bli_extract_conj( transa );

    /* If beta is zero, use setv. Otherwise, scale by beta. */
    /* y = beta * y; */
    /* The beta=0 case is handled by scalv internally, so no separate
       setv call is needed on the AVX path below. */

    // This function is invoked on all architectures including 'generic'.
    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == FALSE)
    {
        const num_t dt = PASTEMAC(c,type);
        /* If beta is zero, use setv. Otherwise, scale by beta. */
        if ( PASTEMAC(c,eq0)( *beta ) )
        {
            scomplex* zero = PASTEMAC(c,0);
            /* y = 0; */
            PASTEMAC2(c,setv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              zero,
              y, incy,
              cntx,
              NULL
            );
        }
        else
        {
            /* y = beta * y; */
            PASTEMAC2(c,scalv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              beta,
              y, incy,
              cntx,
              NULL
            );
        }

        PASTECH(c,axpyf_ker_ft) kfp_af;

        /* Query the context for the kernel function pointer and fusing factor. */
        kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
        b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );

        for ( i = 0; i < n_iter; i += f )
        {
            f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );

            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            kfp_af
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              cntx
            );
        }
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    /* y = beta * y; (beta == 0 is handled inside scalv) */
    bli_cscalv_ex
    (
      BLIS_NO_CONJUGATE,
      n_elem,
      beta,
      y, incy,
      cntx,
      NULL
    );

    // alpha == 0: y has already been scaled by beta above; nothing to add.
    if( bli_ceq0( *alpha ) )
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
        return;
    }

    // for non-unit incx, incy and rs_at and conjugate will be added in the next patch
    if( ( (incx == 1) && (incy == 1) && (rs_at == 1) ) &&
        !bli_is_conj(conja) && !bli_is_conj(conjx) &&
        !bli_is_trans(transa))
    {
        // This gemv code deals with the following conditions only
        // 1. incx, incy, and row stride equal to one
        // 2. Non-conjugate A matrix and X vector
        // 3. No transpose for A matrix
        // Rest is taken care of by the else part (axpyf implementation)
        // NOTE(review): beta is forwarded to the kernel even though y was
        // already scaled by bli_cscalv_ex above — confirm the kernel does
        // not apply beta a second time.
        bli_cgemv_zen_int_4x4
        (
          conja,
          conjx,
          m,
          n,
          alpha,
          a, rs_at, cs_at,
          x, incx,
          beta,
          y, incy,
          cntx
        );
    }
    else
    {
        /* fusing factor. */
        b_fuse = 4;

        for ( i = 0; i < n_iter; i += f )
        {
            f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            bli_caxpyf_zen_int_4
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              cntx
            );
        }
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
|
||||
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2021-22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -216,207 +216,5 @@ void PASTEMAC(ch,varname) \
|
||||
} \
|
||||
}
|
||||
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
|
||||
void bli_post_hemv_8x8
|
||||
(
|
||||
double *a,
|
||||
double *x,
|
||||
double *y,
|
||||
double *alpha,
|
||||
dim_t cs_a,
|
||||
dim_t rs_a
|
||||
);
|
||||
|
||||
// Double-precision HEMV, unblocked variant 1:
//   y := beta * y + alpha * A * x
// where only the uplo triangle of A is stored. In the real domain the
// conjh/conja/conjx conjugations are no-ops, which is why conj0/conj1 are
// simply left as 0 below (cf. the generic GENTFUNC template, which derives
// them from conja/conjh).
//
// Each outer iteration processes a fused panel of f rows: the off-diagonal
// part via a fused dotxf+axpyf kernel, and the diagonal block A11 either
// via the 8x8 helper (unit strides) or a scalar fallback loop.
void bli_dhemv_unf_var1
     (
       uplo_t  uplo,
       conj_t  conja,
       conj_t  conjx,
       conj_t  conjh,
       dim_t   m,
       double*  alpha,
       double*  a, inc_t rs_a, inc_t cs_a,
       double*  x, inc_t incx,
       double*  beta,
       double*  y, inc_t incy,
       cntx_t* cntx
     )
{
    const num_t dt = PASTEMAC(d,type);

    double*  one  = PASTEMAC(d,1);
    double*  zero = PASTEMAC(d,0);
    double*  A10;
    double*  A11;
    double*  a10t;
    double*  alpha11;
    double*  a21;
    double*  x0;
    double*  x1;
    double*  chi11;
    double*  y0;
    double*  y1;
    double*  y01;
    double*  psi11;
    double*  y21;
    double   conjx_chi11;
    double   alpha_chi11;
    double   alpha11_temp;
    dim_t    i, k, j;
    dim_t    b_fuse, f;
    dim_t    n_behind;
    dim_t    f_ahead, f_behind;
    inc_t    rs_at, cs_at;
    // Real data: conjugation is a no-op, so both conj flags stay cleared.
    conj_t   conj0 = 0, conj1 = 0;

    /* The algorithm will be expressed in terms of the lower triangular
     * case; the upper triangular case is supported by swapping the row
     * and column strides of A and toggling some conj parameters. */
    if ( bli_is_lower( uplo ) )
    {
        rs_at = rs_a;
        cs_at = cs_a;
    }
    else /* if ( bli_is_upper( uplo ) ) */
    {
        rs_at = cs_a;
        cs_at = rs_a;
    }

    /* If beta is zero, use setv. Otherwise, scale by beta. */
    if ( PASTEMAC(d,eq0)( *beta ) )
    {
        /* y = 0; */
        PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          m,
          zero,
          y, incy,
          cntx,
          NULL
        );
    }
    else
    {
        /* y = beta * y; */
        PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          m,
          beta,
          y, incy,
          cntx,
          NULL
        );
    }

    PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker;

    /* Query the context for the kernel function pointer and fusing
     * factor. */
    /* Assign kernel function pointer and fusing factor. */
    // Zen family CPUs get the hand-written fused dotxaxpyf kernel;
    // everything else falls back to the context-registered kernel.
    arch_t id = bli_arch_query_id();
    bool bamdzen = ((id == BLIS_ARCH_ZEN4) ||(id == BLIS_ARCH_ZEN3)
                    || (id == BLIS_ARCH_ZEN2) || (id == BLIS_ARCH_ZEN));
    if (bamdzen)
    {
        kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8;
        b_fuse = 8;
    }
    else
    {
        if ( cntx == NULL ) cntx = bli_gks_query_cntx();
        kfp_dotxaxpyf_ker =
            bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx);
        b_fuse =
            bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx );
    }

    for ( i = 0; i < m; i += f )
    {
        f        = bli_determine_blocksize_dim_f( i, m, b_fuse );
        n_behind = i;
        A10      = a + (i )*rs_at + (0 )*cs_at;
        A11      = a + (i )*rs_at + (i )*cs_at;
        x0       = x + (0 )*incx;
        x1       = x + (i )*incx;
        y0       = y + (0 )*incy;
        y1       = y + (i )*incy;

        /* y1 = y1 + alpha * A10  * x0;  (dotxf) */
        /* y0 = y0 + alpha * A10' * x1;  (axpyf) */
        kfp_dotxaxpyf_ker
        (
          conj0,
          conj1,
          conjx,
          conjx,
          n_behind,
          f,
          alpha,
          A10, cs_at, rs_at,
          x0, incx,
          x1, incx,
          one,
          y1, incy,
          y0, incy,
          cntx
        );

        /* y1 = y1 + alpha * A11 * x1;  (variant 4) */
        if((f == 8) && (incx == 1) && (incy == 1) && (cs_at == 1))
        {
            /* This helper function handles unit stride only. */
            // NOTE(review): the forward declaration lists the trailing
            // parameters as (cs_a, rs_a) but the call passes (rs_at, cs_at);
            // confirm the intended argument order.
            bli_post_hemv_8x8(A11, x1, y1, alpha, rs_at, cs_at);
        }
        else
        {
            // Scalar fallback for the f x f diagonal block A11.
            for ( k = 0; k < f; ++k )
            {
                f_behind = k;
                f_ahead  = f - k - 1;
                a10t     = A11 + (k  )*rs_at + (0  )*cs_at;
                alpha11  = A11 + (k  )*rs_at + (k  )*cs_at;
                a21      = A11 + (k+1)*rs_at + (k  )*cs_at;
                chi11    = x1  + (k  )*incx;
                y01      = y1  + (0  )*incy;
                psi11    = y1  + (k  )*incy;
                y21      = y1  + (k+1)*incy;

                /* y01 = y01 + alpha * a10t' * chi11; */
                PASTEMAC(d,copycjs)( conjx, *chi11,
                                     conjx_chi11 );
                PASTEMAC(d,scal2s)( *alpha, conjx_chi11,
                                    alpha_chi11 );
                for ( j = 0; j < f_behind; ++j )
                    PASTEMAC(d,axpys)( alpha_chi11,
                                       *(a10t + j*cs_at),
                                       *(y01 + j*incy) );

                PASTEMAC(d,copycjs)( conja, *alpha11,
                                     alpha11_temp );

                /* psi11 = psi11 + alpha * alpha11 * chi11; */
                PASTEMAC(d,axpys)( alpha_chi11, alpha11_temp,
                                   *psi11 );

                /* y21 = y21 + alpha * a21 * chi11; */
                for ( j = 0; j < f_ahead; ++j )
                {
                    PASTEMAC(d,axpys)( alpha_chi11,
                                       *(a21 + j*rs_at),
                                       *(y21 + j*incy) );
                }
            }
        }
    }
}
|
||||
GENTFUNC(float, s, hemv_unf_var1)
|
||||
GENTFUNC(scomplex, c, hemv_unf_var1)
|
||||
GENTFUNC(dcomplex, z, hemv_unf_var1)
|
||||
#else
|
||||
INSERT_GENTFUNC_BASIC0( hemv_unf_var1 )
|
||||
#endif
|
||||
|
||||
|
||||
418
frame/2/hemv/bli_hemv_unf_var1_amd.c
Normal file
418
frame/2/hemv/bli_hemv_unf_var1_amd.c
Normal file
@@ -0,0 +1,418 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021-22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// Macro template generating hemv unblocked variant 1 for a datatype:
//   y := beta * y + alpha * conja(A) * conjx(x)
// with A Hermitian/symmetric (per conjh) and only the uplo triangle
// stored. Expanded below for the s, c, and z datatypes; the d case has a
// hand-written AMD-optimized implementation instead.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conja, \
       conj_t  conjx, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    ctype*  one  = PASTEMAC(ch,1); \
    ctype*  zero = PASTEMAC(ch,0); \
    ctype*  A10; \
    ctype*  A11; \
    ctype*  a10t; \
    ctype*  alpha11; \
    ctype*  a21; \
    ctype*  x0; \
    ctype*  x1; \
    ctype*  chi11; \
    ctype*  y0; \
    ctype*  y1; \
    ctype*  y01; \
    ctype*  psi11; \
    ctype*  y21; \
    ctype   conjx_chi11; \
    ctype   alpha_chi11; \
    ctype   alpha11_temp; \
    dim_t   i, k, j; \
    dim_t   b_fuse, f; \
    dim_t   n_behind; \
    dim_t   f_ahead, f_behind; \
    inc_t   rs_at, cs_at; \
    conj_t  conj0, conj1; \
\
    /* The algorithm will be expressed in terms of the lower triangular case;
       the upper triangular case is supported by swapping the row and column
       strides of A and toggling some conj parameters. */ \
    if ( bli_is_lower( uplo ) ) \
    { \
        rs_at = rs_a; \
        cs_at = cs_a; \
\
        conj0 = conja; \
        conj1 = bli_apply_conj( conjh, conja ); \
    } \
    else /* if ( bli_is_upper( uplo ) ) */ \
    { \
        rs_at = cs_a; \
        cs_at = rs_a; \
\
        conj0 = bli_apply_conj( conjh, conja ); \
        conj1 = conja; \
    } \
\
    /* If beta is zero, use setv. Otherwise, scale by beta. */ \
    if ( PASTEMAC(ch,eq0)( *beta ) ) \
    { \
        /* y = 0; */ \
        PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
        ( \
          BLIS_NO_CONJUGATE, \
          m, \
          zero, \
          y, incy, \
          cntx, \
          NULL \
        ); \
    } \
    else \
    { \
        /* y = beta * y; */ \
        PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
        ( \
          BLIS_NO_CONJUGATE, \
          m, \
          beta, \
          y, incy, \
          cntx, \
          NULL \
        ); \
    } \
\
    PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
\
    /* Query the context for the kernel function pointer and fusing factor. */ \
    kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
    b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
\
    for ( i = 0; i < m; i += f ) \
    { \
        f        = bli_determine_blocksize_dim_f( i, m, b_fuse ); \
        n_behind = i; \
        A10      = a + (i  )*rs_at + (0  )*cs_at; \
        A11      = a + (i  )*rs_at + (i  )*cs_at; \
        x0       = x + (0  )*incx; \
        x1       = x + (i  )*incx; \
        y0       = y + (0  )*incy; \
        y1       = y + (i  )*incy; \
\
        /* y1 = y1 + alpha * A10  * x0;  (dotxf) */ \
        /* y0 = y0 + alpha * A10' * x1;  (axpyf) */ \
        kfp_xf \
        ( \
          conj0, \
          conj1, \
          conjx, \
          conjx, \
          n_behind, \
          f, \
          alpha, \
          A10, cs_at, rs_at, \
          x0, incx, \
          x1, incx, \
          one, \
          y1, incy, \
          y0, incy, \
          cntx \
        ); \
\
        /* y1 = y1 + alpha * A11 * x1;  (variant 4) */ \
        for ( k = 0; k < f; ++k ) \
        { \
            f_behind = k; \
            f_ahead  = f - k - 1; \
            a10t     = A11 + (k  )*rs_at + (0  )*cs_at; \
            alpha11  = A11 + (k  )*rs_at + (k  )*cs_at; \
            a21      = A11 + (k+1)*rs_at + (k  )*cs_at; \
            chi11    = x1  + (k  )*incx; \
            y01      = y1  + (0  )*incy; \
            psi11    = y1  + (k  )*incy; \
            y21      = y1  + (k+1)*incy; \
\
            /* y01 = y01 + alpha * a10t' * chi11; */ \
            PASTEMAC(ch,copycjs)( conjx, *chi11, conjx_chi11 ); \
            PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \
            if ( bli_is_conj( conj1 ) ) \
            { \
                for ( j = 0; j < f_behind; ++j ) \
                    PASTEMAC(ch,axpyjs)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
            } \
            else \
            { \
                for ( j = 0; j < f_behind; ++j ) \
                    PASTEMAC(ch,axpys)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
            } \
\
            /* For hemv, explicitly set the imaginary component of alpha11 to
               zero. */ \
            PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
            if ( bli_is_conj( conjh ) ) \
                PASTEMAC(ch,seti0s)( alpha11_temp ); \
\
            /* psi11 = psi11 + alpha * alpha11 * chi11; */ \
            PASTEMAC(ch,axpys)( alpha_chi11, alpha11_temp, *psi11 ); \
\
            /* y21 = y21 + alpha * a21 * chi11; */ \
            if ( bli_is_conj( conj0 ) ) \
            { \
                for ( j = 0; j < f_ahead; ++j ) \
                    PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
            } \
            else \
            { \
                for ( j = 0; j < f_ahead; ++j ) \
                    PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
            } \
        } \
    } \
}
|
||||
|
||||
void bli_post_hemv_8x8
|
||||
(
|
||||
double *a,
|
||||
double *x,
|
||||
double *y,
|
||||
double *alpha,
|
||||
dim_t cs_a,
|
||||
dim_t rs_a
|
||||
);
|
||||
|
||||
// AMD-optimized double-precision HEMV, unblocked variant 1:
//   y := beta * y + alpha * A * x
// with only the uplo triangle of A stored. In the real domain conjugation
// is a no-op, so conj0/conj1 are simply left as 0 (the GENTFUNC template
// above derives them from conja/conjh for the complex datatypes).
//
// Kernel selection is a runtime feature check (AVX) rather than a
// compile-time macro, so this file works on all architectures including
// 'generic'.
void bli_dhemv_unf_var1
     (
       uplo_t  uplo,
       conj_t  conja,
       conj_t  conjx,
       conj_t  conjh,
       dim_t   m,
       double*  alpha,
       double*  a, inc_t rs_a, inc_t cs_a,
       double*  x, inc_t incx,
       double*  beta,
       double*  y, inc_t incy,
       cntx_t* cntx
     )
{
    const num_t dt = PASTEMAC(d,type);

    double*  one  = PASTEMAC(d,1);
    double*  zero = PASTEMAC(d,0);
    double*  A10;
    double*  A11;
    double*  a10t;
    double*  alpha11;
    double*  a21;
    double*  x0;
    double*  x1;
    double*  chi11;
    double*  y0;
    double*  y1;
    double*  y01;
    double*  psi11;
    double*  y21;
    double   conjx_chi11;
    double   alpha_chi11;
    double   alpha11_temp;
    dim_t    i, k, j;
    dim_t    b_fuse, f;
    dim_t    n_behind;
    dim_t    f_ahead, f_behind;
    inc_t    rs_at, cs_at;
    // Real data: conjugation is a no-op, so both conj flags stay cleared.
    conj_t   conj0 = 0, conj1 = 0;

    /* The algorithm will be expressed in terms of the lower triangular
     * case; the upper triangular case is supported by swapping the row
     * and column strides of A and toggling some conj parameters. */
    if ( bli_is_lower( uplo ) )
    {
        rs_at = rs_a;
        cs_at = cs_a;
    }
    else /* if ( bli_is_upper( uplo ) ) */
    {
        rs_at = cs_a;
        cs_at = rs_a;
    }

    /* If beta is zero, use setv. Otherwise, scale by beta. */
    if ( PASTEMAC(d,eq0)( *beta ) )
    {
        /* y = 0; */
        PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          m,
          zero,
          y, incy,
          cntx,
          NULL
        );
    }
    else
    {
        /* y = beta * y; */
        PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          m,
          beta,
          y, incy,
          cntx,
          NULL
        );
    }

    PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker;

    /* Query the context for the kernel function pointer and fusing
     * factor. */
    /* Assign kernel function pointer and fusing factor. */

    // This function is invoked on all architectures including 'generic'.
    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == TRUE)
    {
        kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8;
        b_fuse = 8;
    }
    else
    {
        if ( cntx == NULL ) cntx = bli_gks_query_cntx();
        kfp_dotxaxpyf_ker =
            bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx);
        b_fuse =
            bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx );
    }

    for ( i = 0; i < m; i += f )
    {
        f        = bli_determine_blocksize_dim_f( i, m, b_fuse );
        n_behind = i;
        A10      = a + (i )*rs_at + (0 )*cs_at;
        A11      = a + (i )*rs_at + (i )*cs_at;
        x0       = x + (0 )*incx;
        x1       = x + (i )*incx;
        y0       = y + (0 )*incy;
        y1       = y + (i )*incy;

        /* y1 = y1 + alpha * A10  * x0;  (dotxf) */
        /* y0 = y0 + alpha * A10' * x1;  (axpyf) */
        kfp_dotxaxpyf_ker
        (
          conj0,
          conj1,
          conjx,
          conjx,
          n_behind,
          f,
          alpha,
          A10, cs_at, rs_at,
          x0, incx,
          x1, incx,
          one,
          y1, incy,
          y0, incy,
          cntx
        );

        /* y1 = y1 + alpha * A11 * x1;  (variant 4) */
        if((f == 8) && (incx == 1) && (incy == 1) && (cs_at == 1))
        {
            /* This helper function handles unit stride only. */
            // NOTE(review): the forward declaration lists the trailing
            // parameters as (cs_a, rs_a) but the call passes (rs_at, cs_at);
            // confirm the intended argument order.
            bli_post_hemv_8x8(A11, x1, y1, alpha, rs_at, cs_at);
        }
        else
        {
            // Scalar fallback for the f x f diagonal block A11.
            for ( k = 0; k < f; ++k )
            {
                f_behind = k;
                f_ahead  = f - k - 1;
                a10t     = A11 + (k  )*rs_at + (0  )*cs_at;
                alpha11  = A11 + (k  )*rs_at + (k  )*cs_at;
                a21      = A11 + (k+1)*rs_at + (k  )*cs_at;
                chi11    = x1  + (k  )*incx;
                y01      = y1  + (0  )*incy;
                psi11    = y1  + (k  )*incy;
                y21      = y1  + (k+1)*incy;

                /* y01 = y01 + alpha * a10t' * chi11; */
                PASTEMAC(d,copycjs)( conjx, *chi11,
                                     conjx_chi11 );
                PASTEMAC(d,scal2s)( *alpha, conjx_chi11,
                                    alpha_chi11 );
                for ( j = 0; j < f_behind; ++j )
                    PASTEMAC(d,axpys)( alpha_chi11,
                                       *(a10t + j*cs_at),
                                       *(y01 + j*incy) );

                PASTEMAC(d,copycjs)( conja, *alpha11,
                                     alpha11_temp );

                /* psi11 = psi11 + alpha * alpha11 * chi11; */
                PASTEMAC(d,axpys)( alpha_chi11, alpha11_temp,
                                   *psi11 );

                /* y21 = y21 + alpha * a21 * chi11; */
                for ( j = 0; j < f_ahead; ++j )
                {
                    PASTEMAC(d,axpys)( alpha_chi11,
                                       *(a21 + j*rs_at),
                                       *(y21 + j*incy) );
                }
            }
        }
    }
}
|
||||
GENTFUNC(float, s, hemv_unf_var1)
|
||||
GENTFUNC(scomplex, c, hemv_unf_var1)
|
||||
GENTFUNC(dcomplex, z, hemv_unf_var1)
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -216,210 +216,6 @@ void PASTEMAC(ch,varname) \
|
||||
} \
|
||||
}
|
||||
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
|
||||
void bli_pre_hemv_8x8
|
||||
(
|
||||
double *a,
|
||||
double *x,
|
||||
double *y,
|
||||
double *alpha,
|
||||
dim_t cs_a,
|
||||
dim_t rs_a
|
||||
);
|
||||
|
||||
void bli_dhemv_unf_var3
|
||||
(
|
||||
uplo_t uplo,
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
conj_t conjh,
|
||||
dim_t m,
|
||||
double* alpha,
|
||||
double* a, inc_t rs_a, inc_t cs_a,
|
||||
double* x, inc_t incx,
|
||||
double* beta,
|
||||
double* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const num_t dt = PASTEMAC(d,type);
|
||||
|
||||
double* one = PASTEMAC(d,1);
|
||||
double* zero = PASTEMAC(d,0);
|
||||
double* A11;
|
||||
double* A21;
|
||||
double* a10t;
|
||||
double* alpha11;
|
||||
double* a21;
|
||||
double* x1;
|
||||
double* x2;
|
||||
double* chi11;
|
||||
double* y1;
|
||||
double* y2;
|
||||
double* y01;
|
||||
double* psi11;
|
||||
double* y21;
|
||||
double conjx_chi11;
|
||||
double alpha_chi11;
|
||||
double alpha11_temp;
|
||||
dim_t i, k, j;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_ahead;
|
||||
dim_t f_ahead, f_behind;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conj0 = 0, conj1 = 0;
|
||||
|
||||
/* The algorithm will be expressed in terms of the lower triangular
|
||||
* case; the upper triangular case is supported by swapping the row
|
||||
* and column strides of A and toggling some conj parameters. */
|
||||
if ( bli_is_lower( uplo ) )
|
||||
{
|
||||
rs_at = rs_a;
|
||||
cs_at = cs_a;
|
||||
}
|
||||
else /* if ( bli_is_upper( uplo ) ) */
|
||||
{
|
||||
rs_at = cs_a;
|
||||
cs_at = rs_a;
|
||||
}
|
||||
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
if ( PASTEMAC(d,eq0)( *beta ) )
|
||||
{
|
||||
/* y = 0; */
|
||||
PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
m,
|
||||
zero,
|
||||
y, incy,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* y = beta * y; */
|
||||
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
m,
|
||||
beta,
|
||||
y, incy,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker;
|
||||
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = ((id == BLIS_ARCH_ZEN4) || (id == BLIS_ARCH_ZEN3)
|
||||
|| (id == BLIS_ARCH_ZEN2) || (id == BLIS_ARCH_ZEN));
|
||||
if (bamdzen)
|
||||
{
|
||||
kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8;
|
||||
b_fuse = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
kfp_dotxaxpyf_ker =
|
||||
bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx);
|
||||
b_fuse =
|
||||
bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx );
|
||||
}
|
||||
|
||||
for ( i = 0; i < m; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, m, b_fuse );
|
||||
n_ahead = m - i - f;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A21 = a + (i+f)*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x2 = x + (i+f)*incx;
|
||||
y1 = y + (i )*incy;
|
||||
y2 = y + (i+f)*incy;
|
||||
|
||||
/* y1 = y1 + alpha * A11 * x1; (variant 4) */
|
||||
if((f == 8) && (incx == 1) && (incy == 1) && (rs_at == 1))
|
||||
{
|
||||
/*this helper function handles unit stride only*/
|
||||
bli_pre_hemv_8x8(A11, x1, y1, alpha, cs_at, rs_at);
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
f_behind = k;
|
||||
f_ahead = f - k - 1;
|
||||
a10t = A11 + (k )*rs_at + (0 )*cs_at;
|
||||
alpha11 = A11 + (k )*rs_at + (k )*cs_at;
|
||||
a21 = A11 + (k+1)*rs_at + (k )*cs_at;
|
||||
chi11 = x1 + (k )*incx;
|
||||
y01 = y1 + (0 )*incy;
|
||||
psi11 = y1 + (k )*incy;
|
||||
y21 = y1 + (k+1)*incy;
|
||||
|
||||
/* y01 = y01 + alpha * a10t' * chi11; */
|
||||
PASTEMAC(d,copycjs)( conjx,
|
||||
*chi11, conjx_chi11 );
|
||||
PASTEMAC(d,scal2s)( *alpha, conjx_chi11,
|
||||
alpha_chi11 );
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
{
|
||||
PASTEMAC(d,axpys)
|
||||
( alpha_chi11,
|
||||
*(a10t + j*cs_at),
|
||||
*(y01 + j*incy) );
|
||||
}
|
||||
}
|
||||
|
||||
PASTEMAC(d,copycjs)( conja, *alpha11,
|
||||
alpha11_temp );
|
||||
|
||||
/* psi11 = psi11 + alpha * alpha11 * chi11; */
|
||||
PASTEMAC(d,axpys)( alpha_chi11, alpha11_temp,
|
||||
*psi11 );
|
||||
|
||||
/* y21 = y21 + alpha * a21 * chi11; */
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
{
|
||||
PASTEMAC(d,axpys)( alpha_chi11,
|
||||
*(a21 + j*rs_at),
|
||||
*(y21 + j*incy) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* y1 = y1 + alpha * A21' * x2; (dotxf) */
|
||||
/* y2 = y2 + alpha * A21 * x1; (axpyf) */
|
||||
kfp_dotxaxpyf_ker
|
||||
(
|
||||
conj0,
|
||||
conj1,
|
||||
conjx,
|
||||
conjx,
|
||||
n_ahead,
|
||||
f,
|
||||
alpha,
|
||||
A21, rs_at, cs_at,
|
||||
x2, incx,
|
||||
x1, incx,
|
||||
one,
|
||||
y1, incy,
|
||||
y2, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
GENTFUNC(float, s, hemv_unf_var3)
|
||||
GENTFUNC(scomplex, c, hemv_unf_var3)
|
||||
GENTFUNC(dcomplex, z, hemv_unf_var3)
|
||||
#else
|
||||
INSERT_GENTFUNC_BASIC0( hemv_unf_var3 )
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
420
frame/2/hemv/bli_hemv_unf_var3_amd.c
Normal file
420
frame/2/hemv/bli_hemv_unf_var3_amd.c
Normal file
@@ -0,0 +1,420 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
uplo_t uplo, \
|
||||
conj_t conja, \
|
||||
conj_t conjx, \
|
||||
conj_t conjh, \
|
||||
dim_t m, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* x, inc_t incx, \
|
||||
ctype* beta, \
|
||||
ctype* y, inc_t incy, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
ctype* one = PASTEMAC(ch,1); \
|
||||
ctype* zero = PASTEMAC(ch,0); \
|
||||
ctype* A11; \
|
||||
ctype* A21; \
|
||||
ctype* a10t; \
|
||||
ctype* alpha11; \
|
||||
ctype* a21; \
|
||||
ctype* x1; \
|
||||
ctype* x2; \
|
||||
ctype* chi11; \
|
||||
ctype* y1; \
|
||||
ctype* y2; \
|
||||
ctype* y01; \
|
||||
ctype* psi11; \
|
||||
ctype* y21; \
|
||||
ctype conjx_chi11; \
|
||||
ctype alpha_chi11; \
|
||||
ctype alpha11_temp; \
|
||||
dim_t i, k, j; \
|
||||
dim_t b_fuse, f; \
|
||||
dim_t n_ahead; \
|
||||
dim_t f_ahead, f_behind; \
|
||||
inc_t rs_at, cs_at; \
|
||||
conj_t conj0, conj1; \
|
||||
\
|
||||
/* The algorithm will be expressed in terms of the lower triangular case;
|
||||
the upper triangular case is supported by swapping the row and column
|
||||
strides of A and toggling some conj parameters. */ \
|
||||
if ( bli_is_lower( uplo ) ) \
|
||||
{ \
|
||||
rs_at = rs_a; \
|
||||
cs_at = cs_a; \
|
||||
\
|
||||
conj0 = bli_apply_conj( conjh, conja ); \
|
||||
conj1 = conja; \
|
||||
} \
|
||||
else /* if ( bli_is_upper( uplo ) ) */ \
|
||||
{ \
|
||||
rs_at = cs_a; \
|
||||
cs_at = rs_a; \
|
||||
\
|
||||
conj0 = conja; \
|
||||
conj1 = bli_apply_conj( conjh, conja ); \
|
||||
} \
|
||||
\
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */ \
|
||||
if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
/* y = 0; */ \
|
||||
PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
m, \
|
||||
zero, \
|
||||
y, incy, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* y = beta * y; */ \
|
||||
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
m, \
|
||||
beta, \
|
||||
y, incy, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
|
||||
\
|
||||
/* Query the context for the kernel function pointer and fusing factor. */ \
|
||||
kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
|
||||
\
|
||||
for ( i = 0; i < m; i += f ) \
|
||||
{ \
|
||||
f = bli_determine_blocksize_dim_f( i, m, b_fuse ); \
|
||||
n_ahead = m - i - f; \
|
||||
A11 = a + (i )*rs_at + (i )*cs_at; \
|
||||
A21 = a + (i+f)*rs_at + (i )*cs_at; \
|
||||
x1 = x + (i )*incx; \
|
||||
x2 = x + (i+f)*incx; \
|
||||
y1 = y + (i )*incy; \
|
||||
y2 = y + (i+f)*incy; \
|
||||
\
|
||||
/* y1 = y1 + alpha * A11 * x1; (variant 4) */ \
|
||||
for ( k = 0; k < f; ++k ) \
|
||||
{ \
|
||||
f_behind = k; \
|
||||
f_ahead = f - k - 1; \
|
||||
a10t = A11 + (k )*rs_at + (0 )*cs_at; \
|
||||
alpha11 = A11 + (k )*rs_at + (k )*cs_at; \
|
||||
a21 = A11 + (k+1)*rs_at + (k )*cs_at; \
|
||||
chi11 = x1 + (k )*incx; \
|
||||
y01 = y1 + (0 )*incy; \
|
||||
psi11 = y1 + (k )*incy; \
|
||||
y21 = y1 + (k+1)*incy; \
|
||||
\
|
||||
/* y01 = y01 + alpha * a10t' * chi11; */ \
|
||||
PASTEMAC(ch,copycjs)( conjx, *chi11, conjx_chi11 ); \
|
||||
PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \
|
||||
if ( bli_is_conj( conj0 ) ) \
|
||||
{ \
|
||||
for ( j = 0; j < f_behind; ++j ) \
|
||||
PASTEMAC(ch,axpyjs)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( j = 0; j < f_behind; ++j ) \
|
||||
PASTEMAC(ch,axpys)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
|
||||
} \
|
||||
\
|
||||
/* For hemv, explicitly set the imaginary component of alpha11 to
|
||||
zero. */ \
|
||||
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
|
||||
if ( bli_is_conj( conjh ) ) \
|
||||
PASTEMAC(ch,seti0s)( alpha11_temp ); \
|
||||
\
|
||||
/* psi11 = psi11 + alpha * alpha11 * chi11; */ \
|
||||
PASTEMAC(ch,axpys)( alpha_chi11, alpha11_temp, *psi11 ); \
|
||||
\
|
||||
/* y21 = y21 + alpha * a21 * chi11; */ \
|
||||
if ( bli_is_conj( conj1 ) ) \
|
||||
{ \
|
||||
for ( j = 0; j < f_ahead; ++j ) \
|
||||
PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( j = 0; j < f_ahead; ++j ) \
|
||||
PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* y1 = y1 + alpha * A21' * x2; (dotxf) */ \
|
||||
/* y2 = y2 + alpha * A21 * x1; (axpyf) */ \
|
||||
kfp_xf \
|
||||
( \
|
||||
conj0, \
|
||||
conj1, \
|
||||
conjx, \
|
||||
conjx, \
|
||||
n_ahead, \
|
||||
f, \
|
||||
alpha, \
|
||||
A21, rs_at, cs_at, \
|
||||
x2, incx, \
|
||||
x1, incx, \
|
||||
one, \
|
||||
y1, incy, \
|
||||
y2, incy, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
}
|
||||
|
||||
void bli_pre_hemv_8x8
|
||||
(
|
||||
double *a,
|
||||
double *x,
|
||||
double *y,
|
||||
double *alpha,
|
||||
dim_t cs_a,
|
||||
dim_t rs_a
|
||||
);
|
||||
|
||||
void bli_dhemv_unf_var3
|
||||
(
|
||||
uplo_t uplo,
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
conj_t conjh,
|
||||
dim_t m,
|
||||
double* alpha,
|
||||
double* a, inc_t rs_a, inc_t cs_a,
|
||||
double* x, inc_t incx,
|
||||
double* beta,
|
||||
double* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const num_t dt = PASTEMAC(d,type);
|
||||
|
||||
double* one = PASTEMAC(d,1);
|
||||
double* zero = PASTEMAC(d,0);
|
||||
double* A11;
|
||||
double* A21;
|
||||
double* a10t;
|
||||
double* alpha11;
|
||||
double* a21;
|
||||
double* x1;
|
||||
double* x2;
|
||||
double* chi11;
|
||||
double* y1;
|
||||
double* y2;
|
||||
double* y01;
|
||||
double* psi11;
|
||||
double* y21;
|
||||
double conjx_chi11;
|
||||
double alpha_chi11;
|
||||
double alpha11_temp;
|
||||
dim_t i, k, j;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_ahead;
|
||||
dim_t f_ahead, f_behind;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conj0 = 0, conj1 = 0;
|
||||
|
||||
/* The algorithm will be expressed in terms of the lower triangular
|
||||
* case; the upper triangular case is supported by swapping the row
|
||||
* and column strides of A and toggling some conj parameters. */
|
||||
if ( bli_is_lower( uplo ) )
|
||||
{
|
||||
rs_at = rs_a;
|
||||
cs_at = cs_a;
|
||||
}
|
||||
else /* if ( bli_is_upper( uplo ) ) */
|
||||
{
|
||||
rs_at = cs_a;
|
||||
cs_at = rs_a;
|
||||
}
|
||||
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
if ( PASTEMAC(d,eq0)( *beta ) )
|
||||
{
|
||||
/* y = 0; */
|
||||
PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
m,
|
||||
zero,
|
||||
y, incy,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* y = beta * y; */
|
||||
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
m,
|
||||
beta,
|
||||
y, incy,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker;
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE)
|
||||
{
|
||||
kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8;
|
||||
b_fuse = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
kfp_dotxaxpyf_ker =
|
||||
bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx);
|
||||
b_fuse =
|
||||
bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx );
|
||||
}
|
||||
|
||||
for ( i = 0; i < m; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, m, b_fuse );
|
||||
n_ahead = m - i - f;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A21 = a + (i+f)*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x2 = x + (i+f)*incx;
|
||||
y1 = y + (i )*incy;
|
||||
y2 = y + (i+f)*incy;
|
||||
|
||||
/* y1 = y1 + alpha * A11 * x1; (variant 4) */
|
||||
if((f == 8) && (incx == 1) && (incy == 1) && (rs_at == 1))
|
||||
{
|
||||
/*this helper function handles unit stride only*/
|
||||
bli_pre_hemv_8x8(A11, x1, y1, alpha, cs_at, rs_at);
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
f_behind = k;
|
||||
f_ahead = f - k - 1;
|
||||
a10t = A11 + (k )*rs_at + (0 )*cs_at;
|
||||
alpha11 = A11 + (k )*rs_at + (k )*cs_at;
|
||||
a21 = A11 + (k+1)*rs_at + (k )*cs_at;
|
||||
chi11 = x1 + (k )*incx;
|
||||
y01 = y1 + (0 )*incy;
|
||||
psi11 = y1 + (k )*incy;
|
||||
y21 = y1 + (k+1)*incy;
|
||||
|
||||
/* y01 = y01 + alpha * a10t' * chi11; */
|
||||
PASTEMAC(d,copycjs)( conjx,
|
||||
*chi11, conjx_chi11 );
|
||||
PASTEMAC(d,scal2s)( *alpha, conjx_chi11,
|
||||
alpha_chi11 );
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
{
|
||||
PASTEMAC(d,axpys)
|
||||
( alpha_chi11,
|
||||
*(a10t + j*cs_at),
|
||||
*(y01 + j*incy) );
|
||||
}
|
||||
}
|
||||
|
||||
PASTEMAC(d,copycjs)( conja, *alpha11,
|
||||
alpha11_temp );
|
||||
|
||||
/* psi11 = psi11 + alpha * alpha11 * chi11; */
|
||||
PASTEMAC(d,axpys)( alpha_chi11, alpha11_temp,
|
||||
*psi11 );
|
||||
|
||||
/* y21 = y21 + alpha * a21 * chi11; */
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
{
|
||||
PASTEMAC(d,axpys)( alpha_chi11,
|
||||
*(a21 + j*rs_at),
|
||||
*(y21 + j*incy) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* y1 = y1 + alpha * A21' * x2; (dotxf) */
|
||||
/* y2 = y2 + alpha * A21 * x1; (axpyf) */
|
||||
kfp_dotxaxpyf_ker
|
||||
(
|
||||
conj0,
|
||||
conj1,
|
||||
conjx,
|
||||
conjx,
|
||||
n_ahead,
|
||||
f,
|
||||
alpha,
|
||||
A21, rs_at, cs_at,
|
||||
x2, incx,
|
||||
x1, incx,
|
||||
one,
|
||||
y1, incy,
|
||||
y2, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
GENTFUNC(float, s, hemv_unf_var3)
|
||||
GENTFUNC(scomplex, c, hemv_unf_var3)
|
||||
GENTFUNC(dcomplex, z, hemv_unf_var3)
|
||||
|
||||
|
||||
@@ -158,217 +158,5 @@ void PASTEMAC(ch,varname) \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
|
||||
/**
|
||||
* Following is function declaration
|
||||
* that computes her2 for transposed case.
|
||||
* It handles triangular part of matrix and
|
||||
* remaining computation in optimal way to
|
||||
* gain performance improvement.
|
||||
* a is triangular matrix, x and y are vectors
|
||||
*/
|
||||
void bli_dher2_trans_zen_int_4
|
||||
(
|
||||
double *a,
|
||||
double *x,
|
||||
double *y,
|
||||
double *alpha,
|
||||
dim_t m,
|
||||
dim_t lda
|
||||
);
|
||||
|
||||
void bli_dher2_unf_var1
|
||||
(
|
||||
uplo_t uplo,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
conj_t conjh,
|
||||
dim_t m,
|
||||
double* alpha,
|
||||
double* x, inc_t incx,
|
||||
double* y, inc_t incy,
|
||||
double* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const num_t dt = PASTEMAC(d,type);
|
||||
|
||||
double* x0;
|
||||
double* chi1;
|
||||
double* y0;
|
||||
double* psi1;
|
||||
double* c10t;
|
||||
double* gamma11;
|
||||
double alpha0;
|
||||
double alpha1;
|
||||
double alpha0_chi1;
|
||||
double alpha1_psi1;
|
||||
double alpha0_chi1_psi1;
|
||||
double conjx0_chi1;
|
||||
double conjy1_psi1;
|
||||
double conjy0_psi1;
|
||||
dim_t i;
|
||||
dim_t n_behind;
|
||||
inc_t rs_ct, cs_ct;
|
||||
conj_t conj0, conj1;
|
||||
|
||||
/* The algorithm will be expressed in terms of the lower triangular
|
||||
* case;the upper triangular case is supported by swapping the row
|
||||
* and column strides of A and toggling some conj parameters.
|
||||
*/
|
||||
if ( bli_is_lower( uplo ) )
|
||||
{
|
||||
rs_ct = rs_c;
|
||||
cs_ct = cs_c;
|
||||
|
||||
PASTEMAC(d,copys)( *alpha, alpha0 );
|
||||
PASTEMAC(d,copycjs)( conjh, *alpha, alpha1 );
|
||||
}
|
||||
else /* if ( bli_is_upper( uplo ) ) */
|
||||
{
|
||||
rs_ct = cs_c;
|
||||
cs_ct = rs_c;
|
||||
|
||||
/* Toggle conjugation of conjx/conjy, but only if we are being
|
||||
* invoked as her2; for syr2, conjx/conjy are unchanged.
|
||||
*/
|
||||
conjx = bli_apply_conj( conjh, conjx );
|
||||
conjy = bli_apply_conj( conjh, conjy );
|
||||
|
||||
PASTEMAC(d,copycjs)( conjh, *alpha, alpha0 );
|
||||
PASTEMAC(d,copys)( *alpha, alpha1 );
|
||||
}
|
||||
|
||||
/* Apply conjh (which carries the conjugation component of the
|
||||
* Hermitian transpose, if applicable) to conjx and/or conjy as
|
||||
* needed to arrive at the effective conjugation for the vector
|
||||
* subproblems.
|
||||
*/
|
||||
conj0 = bli_apply_conj( conjh, conjy );
|
||||
conj1 = bli_apply_conj( conjh, conjx );
|
||||
|
||||
PASTECH(d,axpy2v_ker_ft) kfp_2v;
|
||||
|
||||
/* Query the context for the kernel function pointer. */
|
||||
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
|
||||
|
||||
if( (incx == 1) && (incy == 1) && (rs_ct == 1))
|
||||
{
|
||||
for ( i = 0; i < m; )
|
||||
{
|
||||
n_behind = i;
|
||||
x0 = x + (0 )*incx;
|
||||
chi1 = x + (i )*incx;
|
||||
y0 = y + (0 )*incy;
|
||||
psi1 = y + (i )*incy;
|
||||
c10t = c + (i )*rs_ct + (0 )*cs_ct;
|
||||
gamma11 = c + (i )*rs_ct + (i )*cs_ct;
|
||||
|
||||
if((n_behind >= 3))
|
||||
{
|
||||
bli_dher2_trans_zen_int_4(c10t, x0, y0, &alpha0, n_behind + 1, cs_ct);
|
||||
i+=4;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Apply conjx and/or conjy to chi1 and/or psi1. */
|
||||
PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
|
||||
PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
|
||||
PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );
|
||||
|
||||
/* Compute scalars for vector subproblems. */
|
||||
PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
|
||||
PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );
|
||||
|
||||
/* Compute alpha * chi1 * conj(psi1) after both chi1
|
||||
* and psi1 have already been conjugated, if needed,
|
||||
* by conjx and conjy.
|
||||
*/
|
||||
PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
|
||||
alpha0_chi1_psi1 );
|
||||
|
||||
/* c10t = c10t + alpha * chi1 * y0'; */
|
||||
/* c10t = c10t + conj(alpha) * psi1 * x0'; */
|
||||
kfp_2v
|
||||
(
|
||||
conj0,
|
||||
conj1,
|
||||
n_behind,
|
||||
&alpha0_chi1,
|
||||
&alpha1_psi1,
|
||||
y0, incy,
|
||||
x0, incx,
|
||||
c10t, cs_ct,
|
||||
cntx
|
||||
);
|
||||
|
||||
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
|
||||
+ conj(alpha) * psi1 * conj(chi1); */
|
||||
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
|
||||
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
|
||||
|
||||
i+=1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < m; ++i )
|
||||
{
|
||||
n_behind = i;
|
||||
x0 = x + (0 )*incx;
|
||||
chi1 = x + (i )*incx;
|
||||
y0 = y + (0 )*incy;
|
||||
psi1 = y + (i )*incy;
|
||||
c10t = c + (i )*rs_ct + (0 )*cs_ct;
|
||||
gamma11 = c + (i )*rs_ct + (i )*cs_ct;
|
||||
|
||||
/* Apply conjx and/or conjy to chi1 and/or psi1. */
|
||||
PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
|
||||
PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
|
||||
PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );
|
||||
|
||||
/* Compute scalars for vector subproblems. */
|
||||
PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
|
||||
PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );
|
||||
|
||||
/* Compute alpha * chi1 * conj(psi1) after both chi1
|
||||
* and psi1 have already been conjugated, if needed,
|
||||
* by conjx and conjy.
|
||||
*/
|
||||
PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
|
||||
alpha0_chi1_psi1 );
|
||||
|
||||
/* c10t = c10t + alpha * chi1 * y0'; */
|
||||
/* c10t = c10t + conj(alpha) * psi1 * x0'; */
|
||||
kfp_2v
|
||||
(
|
||||
conj0,
|
||||
conj1,
|
||||
n_behind,
|
||||
&alpha0_chi1,
|
||||
&alpha1_psi1,
|
||||
y0, incy,
|
||||
x0, incx,
|
||||
c10t, cs_ct,
|
||||
cntx
|
||||
);
|
||||
|
||||
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
|
||||
+ conj(alpha) * psi1 * conj(chi1); */
|
||||
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
|
||||
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GENTFUNC(float, s, her2_unf_var1)
|
||||
GENTFUNC(scomplex, c, her2_unf_var1)
|
||||
GENTFUNC(dcomplex, z,her2_unf_var1)
|
||||
#else
|
||||
INSERT_GENTFUNC_BASIC0( her2_unf_var1 )
|
||||
#endif
|
||||
|
||||
|
||||
369
frame/2/her2/bli_her2_unf_var1_amd.c
Normal file
369
frame/2/her2/bli_her2_unf_var1_amd.c
Normal file
@@ -0,0 +1,369 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
uplo_t uplo, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
conj_t conjh, \
|
||||
dim_t m, \
|
||||
ctype* alpha, \
|
||||
ctype* x, inc_t incx, \
|
||||
ctype* y, inc_t incy, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
ctype* x0; \
|
||||
ctype* chi1; \
|
||||
ctype* y0; \
|
||||
ctype* psi1; \
|
||||
ctype* c10t; \
|
||||
ctype* gamma11; \
|
||||
ctype alpha0; \
|
||||
ctype alpha1; \
|
||||
ctype alpha0_chi1; \
|
||||
ctype alpha1_psi1; \
|
||||
ctype alpha0_chi1_psi1; \
|
||||
ctype conjx0_chi1; \
|
||||
ctype conjy1_psi1; \
|
||||
ctype conjy0_psi1; \
|
||||
dim_t i; \
|
||||
dim_t n_behind; \
|
||||
inc_t rs_ct, cs_ct; \
|
||||
conj_t conj0, conj1; \
|
||||
\
|
||||
/* The algorithm will be expressed in terms of the lower triangular case;
|
||||
the upper triangular case is supported by swapping the row and column
|
||||
strides of A and toggling some conj parameters. */ \
|
||||
if ( bli_is_lower( uplo ) ) \
|
||||
{ \
|
||||
rs_ct = rs_c; \
|
||||
cs_ct = cs_c; \
|
||||
\
|
||||
PASTEMAC(ch,copys)( *alpha, alpha0 ); \
|
||||
PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
|
||||
} \
|
||||
else /* if ( bli_is_upper( uplo ) ) */ \
|
||||
{ \
|
||||
rs_ct = cs_c; \
|
||||
cs_ct = rs_c; \
|
||||
\
|
||||
/* Toggle conjugation of conjx/conjy, but only if we are being invoked
|
||||
as her2; for syr2, conjx/conjy are unchanged. */ \
|
||||
conjx = bli_apply_conj( conjh, conjx ); \
|
||||
conjy = bli_apply_conj( conjh, conjy ); \
|
||||
\
|
||||
PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
|
||||
PASTEMAC(ch,copys)( *alpha, alpha1 ); \
|
||||
} \
|
||||
\
|
||||
/* Apply conjh (which carries the conjugation component of the Hermitian
|
||||
transpose, if applicable) to conjx and/or conjy as needed to arrive at
|
||||
the effective conjugation for the vector subproblems. */ \
|
||||
conj0 = bli_apply_conj( conjh, conjy ); \
|
||||
conj1 = bli_apply_conj( conjh, conjx ); \
|
||||
\
|
||||
PASTECH(ch,axpy2v_ker_ft) kfp_2v; \
|
||||
\
|
||||
/* Query the context for the kernel function pointer. */ \
|
||||
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \
|
||||
\
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
n_behind = i; \
|
||||
x0 = x + (0 )*incx; \
|
||||
chi1 = x + (i )*incx; \
|
||||
y0 = y + (0 )*incy; \
|
||||
psi1 = y + (i )*incy; \
|
||||
c10t = c + (i )*rs_ct + (0 )*cs_ct; \
|
||||
gamma11 = c + (i )*rs_ct + (i )*cs_ct; \
|
||||
\
|
||||
/* Apply conjx and/or conjy to chi1 and/or psi1. */ \
|
||||
PASTEMAC(ch,copycjs)( conjx, *chi1, conjx0_chi1 ); \
|
||||
PASTEMAC(ch,copycjs)( conjy, *psi1, conjy1_psi1 ); \
|
||||
PASTEMAC(ch,copycjs)( conj0, *psi1, conjy0_psi1 ); \
|
||||
\
|
||||
/* Compute scalars for vector subproblems. */ \
|
||||
PASTEMAC(ch,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); \
|
||||
PASTEMAC(ch,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); \
|
||||
\
|
||||
/* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have
|
||||
already been conjugated, if needed, by conjx and conjy. */ \
|
||||
PASTEMAC(ch,scal2s)( alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \
|
||||
\
|
||||
/* c10t = c10t + alpha * chi1 * y0'; */ \
|
||||
/* c10t = c10t + conj(alpha) * psi1 * x0'; */ \
|
||||
kfp_2v \
|
||||
( \
|
||||
conj0, \
|
||||
conj1, \
|
||||
n_behind, \
|
||||
&alpha0_chi1, \
|
||||
&alpha1_psi1, \
|
||||
y0, incy, \
|
||||
x0, incx, \
|
||||
c10t, cs_ct, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \
|
||||
+ conj(alpha) * psi1 * conj(chi1); */ \
|
||||
PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
|
||||
PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
|
||||
\
|
||||
/* For her2, explicitly set the imaginary component of gamma11 to
|
||||
zero. */ \
|
||||
if ( bli_is_conj( conjh ) ) \
|
||||
PASTEMAC(ch,seti0s)( *gamma11 ); \
|
||||
} \
|
||||
}
|
||||
|
||||
/**
|
||||
* Following is function declaration
|
||||
* that computes her2 for transposed case.
|
||||
* It handles triangular part of matrix and
|
||||
* remaining computation in optimal way to
|
||||
* gain performance improvement.
|
||||
* a is triangular matrix, x and y are vectors
|
||||
*/
|
||||
void bli_dher2_trans_zen_int_4
|
||||
(
|
||||
double *a,
|
||||
double *x,
|
||||
double *y,
|
||||
double *alpha,
|
||||
dim_t m,
|
||||
dim_t lda
|
||||
);
|
||||
|
||||
void bli_dher2_unf_var1
|
||||
(
|
||||
uplo_t uplo,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
conj_t conjh,
|
||||
dim_t m,
|
||||
double* alpha,
|
||||
double* x, inc_t incx,
|
||||
double* y, inc_t incy,
|
||||
double* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const num_t dt = PASTEMAC(d,type);
|
||||
|
||||
double* x0;
|
||||
double* chi1;
|
||||
double* y0;
|
||||
double* psi1;
|
||||
double* c10t;
|
||||
double* gamma11;
|
||||
double alpha0;
|
||||
double alpha1;
|
||||
double alpha0_chi1;
|
||||
double alpha1_psi1;
|
||||
double alpha0_chi1_psi1;
|
||||
double conjx0_chi1;
|
||||
double conjy1_psi1;
|
||||
double conjy0_psi1;
|
||||
dim_t i;
|
||||
dim_t n_behind;
|
||||
inc_t rs_ct, cs_ct;
|
||||
conj_t conj0, conj1;
|
||||
|
||||
/* The algorithm will be expressed in terms of the lower triangular
|
||||
* case;the upper triangular case is supported by swapping the row
|
||||
* and column strides of A and toggling some conj parameters.
|
||||
*/
|
||||
if ( bli_is_lower( uplo ) )
|
||||
{
|
||||
rs_ct = rs_c;
|
||||
cs_ct = cs_c;
|
||||
|
||||
PASTEMAC(d,copys)( *alpha, alpha0 );
|
||||
PASTEMAC(d,copycjs)( conjh, *alpha, alpha1 );
|
||||
}
|
||||
else /* if ( bli_is_upper( uplo ) ) */
|
||||
{
|
||||
rs_ct = cs_c;
|
||||
cs_ct = rs_c;
|
||||
|
||||
/* Toggle conjugation of conjx/conjy, but only if we are being
|
||||
* invoked as her2; for syr2, conjx/conjy are unchanged.
|
||||
*/
|
||||
conjx = bli_apply_conj( conjh, conjx );
|
||||
conjy = bli_apply_conj( conjh, conjy );
|
||||
|
||||
PASTEMAC(d,copycjs)( conjh, *alpha, alpha0 );
|
||||
PASTEMAC(d,copys)( *alpha, alpha1 );
|
||||
}
|
||||
|
||||
/* Apply conjh (which carries the conjugation component of the
|
||||
* Hermitian transpose, if applicable) to conjx and/or conjy as
|
||||
* needed to arrive at the effective conjugation for the vector
|
||||
* subproblems.
|
||||
*/
|
||||
conj0 = bli_apply_conj( conjh, conjy );
|
||||
conj1 = bli_apply_conj( conjh, conjx );
|
||||
|
||||
PASTECH(d,axpy2v_ker_ft) kfp_2v;
|
||||
|
||||
/* Query the context for the kernel function pointer. */
|
||||
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
|
||||
|
||||
if( (incx == 1) && (incy == 1) && (rs_ct == 1))
|
||||
{
|
||||
for ( i = 0; i < m; )
|
||||
{
|
||||
n_behind = i;
|
||||
x0 = x + (0 )*incx;
|
||||
chi1 = x + (i )*incx;
|
||||
y0 = y + (0 )*incy;
|
||||
psi1 = y + (i )*incy;
|
||||
c10t = c + (i )*rs_ct + (0 )*cs_ct;
|
||||
gamma11 = c + (i )*rs_ct + (i )*cs_ct;
|
||||
|
||||
if((n_behind >= 3))
|
||||
{
|
||||
bli_dher2_trans_zen_int_4(c10t, x0, y0, &alpha0, n_behind + 1, cs_ct);
|
||||
i+=4;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Apply conjx and/or conjy to chi1 and/or psi1. */
|
||||
PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
|
||||
PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
|
||||
PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );
|
||||
|
||||
/* Compute scalars for vector subproblems. */
|
||||
PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
|
||||
PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );
|
||||
|
||||
/* Compute alpha * chi1 * conj(psi1) after both chi1
|
||||
* and psi1 have already been conjugated, if needed,
|
||||
* by conjx and conjy.
|
||||
*/
|
||||
PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
|
||||
alpha0_chi1_psi1 );
|
||||
|
||||
/* c10t = c10t + alpha * chi1 * y0'; */
|
||||
/* c10t = c10t + conj(alpha) * psi1 * x0'; */
|
||||
kfp_2v
|
||||
(
|
||||
conj0,
|
||||
conj1,
|
||||
n_behind,
|
||||
&alpha0_chi1,
|
||||
&alpha1_psi1,
|
||||
y0, incy,
|
||||
x0, incx,
|
||||
c10t, cs_ct,
|
||||
cntx
|
||||
);
|
||||
|
||||
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
|
||||
+ conj(alpha) * psi1 * conj(chi1); */
|
||||
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
|
||||
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
|
||||
|
||||
i+=1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < m; ++i )
|
||||
{
|
||||
n_behind = i;
|
||||
x0 = x + (0 )*incx;
|
||||
chi1 = x + (i )*incx;
|
||||
y0 = y + (0 )*incy;
|
||||
psi1 = y + (i )*incy;
|
||||
c10t = c + (i )*rs_ct + (0 )*cs_ct;
|
||||
gamma11 = c + (i )*rs_ct + (i )*cs_ct;
|
||||
|
||||
/* Apply conjx and/or conjy to chi1 and/or psi1. */
|
||||
PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
|
||||
PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
|
||||
PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );
|
||||
|
||||
/* Compute scalars for vector subproblems. */
|
||||
PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
|
||||
PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );
|
||||
|
||||
/* Compute alpha * chi1 * conj(psi1) after both chi1
|
||||
* and psi1 have already been conjugated, if needed,
|
||||
* by conjx and conjy.
|
||||
*/
|
||||
PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
|
||||
alpha0_chi1_psi1 );
|
||||
|
||||
/* c10t = c10t + alpha * chi1 * y0'; */
|
||||
/* c10t = c10t + conj(alpha) * psi1 * x0'; */
|
||||
kfp_2v
|
||||
(
|
||||
conj0,
|
||||
conj1,
|
||||
n_behind,
|
||||
&alpha0_chi1,
|
||||
&alpha1_psi1,
|
||||
y0, incy,
|
||||
x0, incx,
|
||||
c10t, cs_ct,
|
||||
cntx
|
||||
);
|
||||
|
||||
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
|
||||
+ conj(alpha) * psi1 * conj(chi1); */
|
||||
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
|
||||
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GENTFUNC(float, s, her2_unf_var1)
|
||||
GENTFUNC(scomplex, c, her2_unf_var1)
|
||||
GENTFUNC(dcomplex, z,her2_unf_var1)
|
||||
|
||||
|
||||
@@ -166,192 +166,5 @@ void PASTEMAC(ch,varname) \
|
||||
} \
|
||||
}
|
||||
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
/**
|
||||
* Following is function declaration
|
||||
* that computes her2 for transposed case.
|
||||
* It handles triangular part of matrix and
|
||||
* remaining computation in optimal way to
|
||||
* gain performance improvement.
|
||||
* a is triangular matrix, x and y are vectors
|
||||
*/
|
||||
void bli_dher2_zen_int_4
|
||||
(
|
||||
double *a,
|
||||
double *x,
|
||||
double *y,
|
||||
double *alpha,
|
||||
dim_t m,
|
||||
dim_t lda
|
||||
);
|
||||
|
||||
void bli_dher2_unf_var4
|
||||
(
|
||||
uplo_t uplo,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
conj_t conjh,
|
||||
dim_t m,
|
||||
double* alpha,
|
||||
double* x, inc_t incx,
|
||||
double* y, inc_t incy,
|
||||
double* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
double* chi1;
|
||||
double* x2;
|
||||
double* psi1;
|
||||
double* y2;
|
||||
double* gamma11;
|
||||
double* c21;
|
||||
double alpha0;
|
||||
double alpha0_psi1;
|
||||
double alpha1_chi1;
|
||||
double alpha0_chi1_psi1;
|
||||
dim_t i;
|
||||
dim_t n_ahead;
|
||||
inc_t rs_ct, cs_ct;
|
||||
|
||||
const num_t dt = PASTEMAC(d,type);
|
||||
|
||||
/* The algorithm will be expressed in terms of the lower triangular
|
||||
* case; the upper triangular case is supported by swapping the row
|
||||
* and column strides of A and toggling some conj parameters.
|
||||
*/
|
||||
if ( bli_is_lower( uplo ) )
|
||||
{
|
||||
rs_ct = rs_c;
|
||||
cs_ct = cs_c;
|
||||
|
||||
PASTEMAC(d,copys)( *alpha, alpha0 );
|
||||
}
|
||||
else /* if ( bli_is_upper( uplo ) ) */
|
||||
{
|
||||
rs_ct = cs_c;
|
||||
cs_ct = rs_c;
|
||||
|
||||
/* Toggle conjugation of conjx/conjy, but only if we are being
|
||||
* invoked as her2; for syr2, conjx/conjy are unchanged.
|
||||
*/
|
||||
|
||||
PASTEMAC(d,copys)( *alpha, alpha0 );
|
||||
}
|
||||
/* Apply conjh (which carries the conjugation component of the
|
||||
* Hermitian transpose, if applicable) to conjx and/or conjy as
|
||||
* needed to arrive at the effective conjugation for the vector
|
||||
* subproblems.
|
||||
*/
|
||||
|
||||
PASTECH(d,axpy2v_ker_ft) kfp_2v;
|
||||
|
||||
/* Query the context for the kernel function pointer. */
|
||||
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
|
||||
|
||||
if((incx == 1) && (incy == 1) && (rs_ct == 1))
|
||||
{
|
||||
for ( i = 0; i < m; )
|
||||
{
|
||||
n_ahead = m - i - 1;
|
||||
chi1 = x + (i ) * incx;
|
||||
x2 = x + (i+1) * incx;
|
||||
psi1 = y + (i ) * incy;
|
||||
y2 = y + (i+1) * incy;
|
||||
gamma11 = c + (i ) + (i )*cs_ct;
|
||||
c21 = c + (i+1) + (i )*cs_ct;
|
||||
|
||||
if((n_ahead >= 3))
|
||||
{
|
||||
bli_dher2_zen_int_4(gamma11, chi1, psi1, &alpha0, n_ahead + 1, cs_ct);
|
||||
i+= 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Compute scalars for vector subproblems. */
|
||||
PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
|
||||
PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );
|
||||
|
||||
/* Compute alpha * chi1 * conj(psi1) after both chi1
|
||||
* and psi1 have
|
||||
already been conjugated, if needed, by conjx and
|
||||
conjy. */
|
||||
PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
|
||||
alpha0_chi1_psi1 );
|
||||
|
||||
/* c21 = c21 + alpha * x2 * conj(psi1); */
|
||||
/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */
|
||||
|
||||
kfp_2v
|
||||
(
|
||||
conjx,
|
||||
conjy,
|
||||
n_ahead,
|
||||
&alpha0_psi1,
|
||||
&alpha1_chi1,
|
||||
x2, incx,
|
||||
y2, incy,
|
||||
c21, rs_ct,
|
||||
cntx
|
||||
);
|
||||
|
||||
|
||||
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
|
||||
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
|
||||
i+=1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < m; ++i)
|
||||
{
|
||||
n_ahead = m - i - 1;
|
||||
chi1 = x + (i ) * incx;
|
||||
x2 = x + (i+1) * incx;
|
||||
psi1 = y + (i ) * incy;
|
||||
y2 = y + (i+1) * incy;
|
||||
gamma11 = c + (i ) + (i )*cs_ct;
|
||||
c21 = c + (i+1) + (i )*cs_ct;
|
||||
|
||||
/* Compute scalars for vector subproblems. */
|
||||
PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
|
||||
PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );
|
||||
|
||||
/* Compute alpha * chi1 * conj(psi1) after both chi1
|
||||
* and psi1 have
|
||||
already been conjugated, if needed, by conjx and
|
||||
conjy. */
|
||||
PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
|
||||
alpha0_chi1_psi1 );
|
||||
|
||||
/* c21 = c21 + alpha * x2 * conj(psi1); */
|
||||
/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */
|
||||
|
||||
kfp_2v
|
||||
(
|
||||
conjx,
|
||||
conjy,
|
||||
n_ahead,
|
||||
&alpha0_psi1,
|
||||
&alpha1_chi1,
|
||||
x2, incx,
|
||||
y2, incy,
|
||||
c21, rs_ct,
|
||||
cntx
|
||||
);
|
||||
|
||||
|
||||
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
|
||||
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GENTFUNC(float, s, her2_unf_var4)
|
||||
GENTFUNC(scomplex, c, her2_unf_var4)
|
||||
GENTFUNC(dcomplex, z,her2_unf_var4)
|
||||
#else
|
||||
INSERT_GENTFUNC_BASIC0( her2_unf_var4 )
|
||||
#endif
|
||||
|
||||
|
||||
354
frame/2/her2/bli_her2_unf_var4_amd.c
Normal file
354
frame/2/her2/bli_her2_unf_var4_amd.c
Normal file
@@ -0,0 +1,354 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
uplo_t uplo, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
conj_t conjh, \
|
||||
dim_t m, \
|
||||
ctype* alpha, \
|
||||
ctype* x, inc_t incx, \
|
||||
ctype* y, inc_t incy, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
ctype* chi1; \
|
||||
ctype* x2; \
|
||||
ctype* psi1; \
|
||||
ctype* y2; \
|
||||
ctype* gamma11; \
|
||||
ctype* c21; \
|
||||
ctype alpha0; \
|
||||
ctype alpha1; \
|
||||
ctype alpha0_psi1; \
|
||||
ctype alpha1_chi1; \
|
||||
ctype alpha0_chi1_psi1; \
|
||||
ctype conjy0_psi1; \
|
||||
ctype conjx1_chi1; \
|
||||
ctype conjx0_chi1; \
|
||||
dim_t i; \
|
||||
dim_t n_ahead; \
|
||||
inc_t rs_ct, cs_ct; \
|
||||
conj_t conj0, conj1; \
|
||||
conj_t conjh_conjx; \
|
||||
conj_t conjh_conjy; \
|
||||
\
|
||||
/* Eliminate unused variable warnings. */ \
|
||||
( void )conjh_conjx; \
|
||||
( void )conjh_conjy; \
|
||||
\
|
||||
/* The algorithm will be expressed in terms of the lower triangular case;
|
||||
the upper triangular case is supported by swapping the row and column
|
||||
strides of A and toggling some conj parameters. */ \
|
||||
if ( bli_is_lower( uplo ) ) \
|
||||
{ \
|
||||
rs_ct = rs_c; \
|
||||
cs_ct = cs_c; \
|
||||
\
|
||||
PASTEMAC(ch,copys)( *alpha, alpha0 ); \
|
||||
PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
|
||||
} \
|
||||
else /* if ( bli_is_upper( uplo ) ) */ \
|
||||
{ \
|
||||
rs_ct = cs_c; \
|
||||
cs_ct = rs_c; \
|
||||
\
|
||||
/* Toggle conjugation of conjx/conjy, but only if we are being invoked
|
||||
as her2; for syr2, conjx/conjy are unchanged. */ \
|
||||
conjx = bli_apply_conj( conjh, conjx ); \
|
||||
conjy = bli_apply_conj( conjh, conjy ); \
|
||||
\
|
||||
PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
|
||||
PASTEMAC(ch,copys)( *alpha, alpha1 ); \
|
||||
} \
|
||||
\
|
||||
/* Apply conjh (which carries the conjugation component of the Hermitian
|
||||
transpose, if applicable) to conjx and/or conjy as needed to arrive at
|
||||
the effective conjugation for the vector subproblems. */ \
|
||||
conj0 = conjx; \
|
||||
conj1 = conjy; \
|
||||
conjh_conjx = bli_apply_conj( conjh, conjx ); \
|
||||
conjh_conjy = bli_apply_conj( conjh, conjy ); \
|
||||
\
|
||||
PASTECH(ch,axpy2v_ker_ft) kfp_2v; \
|
||||
\
|
||||
/* Query the context for the kernel function pointer. */ \
|
||||
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \
|
||||
\
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
n_ahead = m - i - 1; \
|
||||
chi1 = x + (i )*incx; \
|
||||
x2 = x + (i+1)*incx; \
|
||||
psi1 = y + (i )*incy; \
|
||||
y2 = y + (i+1)*incy; \
|
||||
gamma11 = c + (i )*rs_ct + (i )*cs_ct; \
|
||||
c21 = c + (i+1)*rs_ct + (i )*cs_ct; \
|
||||
\
|
||||
/* Apply conjx and/or conjy to chi1 and/or psi1. */ \
|
||||
PASTEMAC(ch,copycjs)( conjh_conjy, *psi1, conjy0_psi1 ); \
|
||||
PASTEMAC(ch,copycjs)( conjh_conjx, *chi1, conjx1_chi1 ); \
|
||||
PASTEMAC(ch,copycjs)( conj0, *chi1, conjx0_chi1 ); \
|
||||
\
|
||||
/* Compute scalars for vector subproblems. */ \
|
||||
PASTEMAC(ch,scal2s)( alpha0, conjy0_psi1, alpha0_psi1 ); \
|
||||
PASTEMAC(ch,scal2s)( alpha1, conjx1_chi1, alpha1_chi1 ); \
|
||||
\
|
||||
/* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have
|
||||
already been conjugated, if needed, by conjx and conjy. */ \
|
||||
PASTEMAC(ch,scal2s)( alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \
|
||||
\
|
||||
/* c21 = c21 + alpha * x2 * conj(psi1); */ \
|
||||
/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ \
|
||||
kfp_2v \
|
||||
( \
|
||||
conj0, \
|
||||
conj1, \
|
||||
n_ahead, \
|
||||
&alpha0_psi1, \
|
||||
&alpha1_chi1, \
|
||||
x2, incx, \
|
||||
y2, incy, \
|
||||
c21, rs_ct, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \
|
||||
+ conj(alpha) * psi1 * conj(chi1); */ \
|
||||
PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
|
||||
PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
|
||||
\
|
||||
/* For her2, explicitly set the imaginary component of gamma11 to
|
||||
zero. */ \
|
||||
if ( bli_is_conj( conjh ) ) \
|
||||
PASTEMAC(ch,seti0s)( *gamma11 ); \
|
||||
} \
|
||||
}
|
||||
|
||||
/**
|
||||
* Following is function declaration
|
||||
* that computes her2 for transposed case.
|
||||
* It handles triangular part of matrix and
|
||||
* remaining computation in optimal way to
|
||||
* gain performance improvement.
|
||||
* a is triangular matrix, x and y are vectors
|
||||
*/
|
||||
void bli_dher2_zen_int_4
|
||||
(
|
||||
double *a,
|
||||
double *x,
|
||||
double *y,
|
||||
double *alpha,
|
||||
dim_t m,
|
||||
dim_t lda
|
||||
);
|
||||
|
||||
void bli_dher2_unf_var4
|
||||
(
|
||||
uplo_t uplo,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
conj_t conjh,
|
||||
dim_t m,
|
||||
double* alpha,
|
||||
double* x, inc_t incx,
|
||||
double* y, inc_t incy,
|
||||
double* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
double* chi1;
|
||||
double* x2;
|
||||
double* psi1;
|
||||
double* y2;
|
||||
double* gamma11;
|
||||
double* c21;
|
||||
double alpha0;
|
||||
double alpha0_psi1;
|
||||
double alpha1_chi1;
|
||||
double alpha0_chi1_psi1;
|
||||
dim_t i;
|
||||
dim_t n_ahead;
|
||||
inc_t rs_ct, cs_ct;
|
||||
|
||||
const num_t dt = PASTEMAC(d,type);
|
||||
|
||||
/* The algorithm will be expressed in terms of the lower triangular
|
||||
* case; the upper triangular case is supported by swapping the row
|
||||
* and column strides of A and toggling some conj parameters.
|
||||
*/
|
||||
if ( bli_is_lower( uplo ) )
|
||||
{
|
||||
rs_ct = rs_c;
|
||||
cs_ct = cs_c;
|
||||
|
||||
PASTEMAC(d,copys)( *alpha, alpha0 );
|
||||
}
|
||||
else /* if ( bli_is_upper( uplo ) ) */
|
||||
{
|
||||
rs_ct = cs_c;
|
||||
cs_ct = rs_c;
|
||||
|
||||
/* Toggle conjugation of conjx/conjy, but only if we are being
|
||||
* invoked as her2; for syr2, conjx/conjy are unchanged.
|
||||
*/
|
||||
|
||||
PASTEMAC(d,copys)( *alpha, alpha0 );
|
||||
}
|
||||
/* Apply conjh (which carries the conjugation component of the
|
||||
* Hermitian transpose, if applicable) to conjx and/or conjy as
|
||||
* needed to arrive at the effective conjugation for the vector
|
||||
* subproblems.
|
||||
*/
|
||||
|
||||
PASTECH(d,axpy2v_ker_ft) kfp_2v;
|
||||
|
||||
/* Query the context for the kernel function pointer. */
|
||||
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
|
||||
|
||||
if((incx == 1) && (incy == 1) && (rs_ct == 1))
|
||||
{
|
||||
for ( i = 0; i < m; )
|
||||
{
|
||||
n_ahead = m - i - 1;
|
||||
chi1 = x + (i ) * incx;
|
||||
x2 = x + (i+1) * incx;
|
||||
psi1 = y + (i ) * incy;
|
||||
y2 = y + (i+1) * incy;
|
||||
gamma11 = c + (i ) + (i )*cs_ct;
|
||||
c21 = c + (i+1) + (i )*cs_ct;
|
||||
|
||||
if((n_ahead >= 3))
|
||||
{
|
||||
bli_dher2_zen_int_4(gamma11, chi1, psi1, &alpha0, n_ahead + 1, cs_ct);
|
||||
i+= 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Compute scalars for vector subproblems. */
|
||||
PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
|
||||
PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );
|
||||
|
||||
/* Compute alpha * chi1 * conj(psi1) after both chi1
|
||||
* and psi1 have
|
||||
already been conjugated, if needed, by conjx and
|
||||
conjy. */
|
||||
PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
|
||||
alpha0_chi1_psi1 );
|
||||
|
||||
/* c21 = c21 + alpha * x2 * conj(psi1); */
|
||||
/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */
|
||||
|
||||
kfp_2v
|
||||
(
|
||||
conjx,
|
||||
conjy,
|
||||
n_ahead,
|
||||
&alpha0_psi1,
|
||||
&alpha1_chi1,
|
||||
x2, incx,
|
||||
y2, incy,
|
||||
c21, rs_ct,
|
||||
cntx
|
||||
);
|
||||
|
||||
|
||||
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
|
||||
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
|
||||
i+=1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < m; ++i)
|
||||
{
|
||||
n_ahead = m - i - 1;
|
||||
chi1 = x + (i ) * incx;
|
||||
x2 = x + (i+1) * incx;
|
||||
psi1 = y + (i ) * incy;
|
||||
y2 = y + (i+1) * incy;
|
||||
gamma11 = c + (i ) + (i )*cs_ct;
|
||||
c21 = c + (i+1) + (i )*cs_ct;
|
||||
|
||||
/* Compute scalars for vector subproblems. */
|
||||
PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
|
||||
PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );
|
||||
|
||||
/* Compute alpha * chi1 * conj(psi1) after both chi1
|
||||
* and psi1 have
|
||||
already been conjugated, if needed, by conjx and
|
||||
conjy. */
|
||||
PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
|
||||
alpha0_chi1_psi1 );
|
||||
|
||||
/* c21 = c21 + alpha * x2 * conj(psi1); */
|
||||
/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */
|
||||
|
||||
kfp_2v
|
||||
(
|
||||
conjx,
|
||||
conjy,
|
||||
n_ahead,
|
||||
&alpha0_psi1,
|
||||
&alpha1_chi1,
|
||||
x2, incx,
|
||||
y2, incy,
|
||||
c21, rs_ct,
|
||||
cntx
|
||||
);
|
||||
|
||||
|
||||
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
|
||||
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GENTFUNC(float, s, her2_unf_var4)
|
||||
GENTFUNC(scomplex, c, her2_unf_var4)
|
||||
GENTFUNC(dcomplex, z,her2_unf_var4)
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -231,421 +231,4 @@ void PASTEMAC(ch,varname) \
|
||||
} \
|
||||
}
|
||||
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
void bli_dtrsv_unf_var1
|
||||
(
|
||||
uplo_t uploa,
|
||||
trans_t transa,
|
||||
diag_t diaga,
|
||||
dim_t m,
|
||||
double* alpha,
|
||||
double* a, inc_t rs_a, inc_t cs_a,
|
||||
double* x, inc_t incx,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
double* one = PASTEMAC(d,1);
|
||||
double* minus_one = PASTEMAC(d,m1);
|
||||
double* A10;
|
||||
double* A11;
|
||||
double* A12;
|
||||
double* a10t;
|
||||
double* alpha11;
|
||||
double* a12t;
|
||||
double* x0;
|
||||
double* x1;
|
||||
double* x2;
|
||||
double* x01;
|
||||
double* chi11;
|
||||
double* x21;
|
||||
double alpha11_conj;
|
||||
double rho1;
|
||||
dim_t iter, i, k, j, l;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_behind, f_behind;
|
||||
inc_t rs_at, cs_at;
|
||||
uplo_t uploa_trans;
|
||||
conj_t conja;
|
||||
|
||||
/* x = alpha * x; */
|
||||
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
m,
|
||||
alpha,
|
||||
x, incx,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
if( bli_does_notrans( transa ) )
|
||||
{
|
||||
rs_at = rs_a;
|
||||
cs_at = cs_a;
|
||||
uploa_trans = uploa;
|
||||
}
|
||||
else /* if ( bli_does_trans( transa ) ) */
|
||||
{
|
||||
rs_at = cs_a;
|
||||
cs_at = rs_a;
|
||||
uploa_trans = bli_uplo_toggled( uploa );
|
||||
}
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
PASTECH(d,dotxf_ker_ft) kfp_df;
|
||||
|
||||
/* Assign kernel function pointer and fusing factor. */
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen) {
|
||||
kfp_df = bli_ddotxf_zen_int_8;
|
||||
b_fuse = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
num_t dt = PASTEMAC(d,type);
|
||||
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
|
||||
}
|
||||
|
||||
/* We reduce all of the possible cases down to just lower/upper. */
|
||||
if ( bli_is_upper( uploa_trans ) )
|
||||
{
|
||||
for ( iter = 0; iter < m; iter += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
|
||||
i = m - iter - f;
|
||||
n_behind = iter;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A12 = a + (i )*rs_at + (i+f)*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x2 = x + (i+f)*incx;
|
||||
|
||||
/* x1 = x1 - A12 * x2; */
|
||||
kfp_df
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_behind,
|
||||
f,
|
||||
minus_one,
|
||||
A12, cs_at, rs_at,
|
||||
x2, incx,
|
||||
one,
|
||||
x1, incx,
|
||||
cntx
|
||||
);
|
||||
|
||||
/* x1 = x1 / triu( A11 ); */
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
l = f - k - 1;
|
||||
f_behind = k;
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
|
||||
a12t = A11 + (l )*rs_at + (l+1)*cs_at;
|
||||
chi11 = x1 + (l )*incx;
|
||||
x21 = x1 + (l+1)*incx;
|
||||
|
||||
/* chi11 = chi11 - a12t * x21; */
|
||||
PASTEMAC(d,set0s)( rho1 );
|
||||
if ( bli_is_conj( conja ) )
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
PASTEMAC(d,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
PASTEMAC(d,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
|
||||
}
|
||||
PASTEMAC(d,subs)( rho1, *chi11 );
|
||||
|
||||
/* chi11 = chi11 / alpha11; */
|
||||
if ( bli_is_nonunit_diag( diaga ) )
|
||||
{
|
||||
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
|
||||
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else /* if ( bli_is_lower( uploa_trans ) ) */
|
||||
{
|
||||
for ( iter = 0; iter < m; iter += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
|
||||
i = iter;
|
||||
n_behind = i;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A10 = a + (i )*rs_at + (0 )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x0 = x + (0 )*incx;
|
||||
|
||||
/* x1 = x1 - A10 * x0; */
|
||||
kfp_df
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_behind,
|
||||
f,
|
||||
minus_one,
|
||||
A10, cs_at, rs_at,
|
||||
x0, incx,
|
||||
one,
|
||||
x1, incx,
|
||||
cntx
|
||||
);
|
||||
|
||||
/* x1 = x1 / tril( A11 ); */
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
l = k;
|
||||
f_behind = l;
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
|
||||
a10t = A11 + (l )*rs_at + (0 )*cs_at;
|
||||
chi11 = x1 + (l )*incx;
|
||||
x01 = x1 + (0 )*incx;
|
||||
|
||||
/* chi11 = chi11 - a10t * x01; */
|
||||
PASTEMAC(d,set0s)( rho1 );
|
||||
if ( bli_is_conj( conja ) )
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
PASTEMAC(d,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
PASTEMAC(d,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
|
||||
}
|
||||
PASTEMAC(d,subs)( rho1, *chi11 );
|
||||
|
||||
/* chi11 = chi11 / alpha11; */
|
||||
if ( bli_is_nonunit_diag( diaga ) )
|
||||
{
|
||||
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
|
||||
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void bli_strsv_unf_var1
|
||||
(
|
||||
uplo_t uploa,
|
||||
trans_t transa,
|
||||
diag_t diaga,
|
||||
dim_t m,
|
||||
float* alpha,
|
||||
float* a, inc_t rs_a, inc_t cs_a,
|
||||
float* x, inc_t incx,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
float* one = PASTEMAC(s,1);
|
||||
float* minus_one = PASTEMAC(s,m1);
|
||||
float* A10;
|
||||
float* A11;
|
||||
float* A12;
|
||||
float* a10t;
|
||||
float* alpha11;
|
||||
float* a12t;
|
||||
float* x0;
|
||||
float* x1;
|
||||
float* x2;
|
||||
float* x01;
|
||||
float* chi11;
|
||||
float* x21;
|
||||
float alpha11_conj;
|
||||
float rho1;
|
||||
dim_t iter, i, k, j, l;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_behind, f_behind;
|
||||
inc_t rs_at, cs_at;
|
||||
uplo_t uploa_trans;
|
||||
conj_t conja;
|
||||
|
||||
/* x = alpha * x; */
|
||||
PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
m,
|
||||
alpha,
|
||||
x, incx,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
if( bli_does_notrans( transa ) )
|
||||
{
|
||||
rs_at = rs_a;
|
||||
cs_at = cs_a;
|
||||
uploa_trans = uploa;
|
||||
}
|
||||
else /* if ( bli_does_trans( transa ) ) */
|
||||
{
|
||||
rs_at = cs_a;
|
||||
cs_at = rs_a;
|
||||
uploa_trans = bli_uplo_toggled( uploa );
|
||||
}
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
PASTECH(s,dotxf_ker_ft) kfp_df;
|
||||
|
||||
/* Assign kernel function pointer and fusing factor. */
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen) {
|
||||
kfp_df = bli_sdotxf_zen_int_8;
|
||||
b_fuse = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
num_t dt = PASTEMAC(s,type);
|
||||
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
|
||||
|
||||
}
|
||||
|
||||
/* We reduce all of the possible cases down to just lower/upper. */
|
||||
if ( bli_is_upper( uploa_trans ) )
|
||||
{
|
||||
for ( iter = 0; iter < m; iter += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
|
||||
i = m - iter - f;
|
||||
n_behind = iter;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A12 = a + (i )*rs_at + (i+f)*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x2 = x + (i+f)*incx;
|
||||
|
||||
/* x1 = x1 - A12 * x2; */
|
||||
kfp_df
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_behind,
|
||||
f,
|
||||
minus_one,
|
||||
A12, cs_at, rs_at,
|
||||
x2, incx,
|
||||
one,
|
||||
x1, incx,
|
||||
cntx
|
||||
);
|
||||
|
||||
/* x1 = x1 / triu( A11 ); */
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
l = f - k - 1;
|
||||
f_behind = k;
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
|
||||
a12t = A11 + (l )*rs_at + (l+1)*cs_at;
|
||||
chi11 = x1 + (l )*incx;
|
||||
x21 = x1 + (l+1)*incx;
|
||||
|
||||
/* chi11 = chi11 - a12t * x21; */
|
||||
PASTEMAC(s,set0s)( rho1 );
|
||||
if ( bli_is_conj( conja ) )
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
PASTEMAC(s,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
PASTEMAC(s,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
|
||||
}
|
||||
PASTEMAC(s,subs)( rho1, *chi11 );
|
||||
|
||||
/* chi11 = chi11 / alpha11; */
|
||||
if ( bli_is_nonunit_diag( diaga ) )
|
||||
{
|
||||
PASTEMAC(s,copycjs)( conja, *alpha11, alpha11_conj );
|
||||
PASTEMAC(s,invscals)( alpha11_conj, *chi11 );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else /* if ( bli_is_lower( uploa_trans ) ) */
|
||||
{
|
||||
for ( iter = 0; iter < m; iter += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
|
||||
i = iter;
|
||||
n_behind = i;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A10 = a + (i )*rs_at + (0 )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x0 = x + (0 )*incx;
|
||||
|
||||
/* x1 = x1 - A10 * x0; */
|
||||
kfp_df
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_behind,
|
||||
f,
|
||||
minus_one,
|
||||
A10, cs_at, rs_at,
|
||||
x0, incx,
|
||||
one,
|
||||
x1, incx,
|
||||
cntx
|
||||
);
|
||||
|
||||
/* x1 = x1 / tril( A11 ); */
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
l = k;
|
||||
f_behind = l;
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
|
||||
a10t = A11 + (l )*rs_at + (0 )*cs_at;
|
||||
chi11 = x1 + (l )*incx;
|
||||
x01 = x1 + (0 )*incx;
|
||||
|
||||
/* chi11 = chi11 - a10t * x01; */
|
||||
PASTEMAC(s,set0s)( rho1 );
|
||||
if ( bli_is_conj( conja ) )
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
PASTEMAC(s,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
PASTEMAC(s,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
|
||||
}
|
||||
PASTEMAC(s,subs)( rho1, *chi11 );
|
||||
|
||||
/* chi11 = chi11 / alpha11; */
|
||||
if ( bli_is_nonunit_diag( diaga ) )
|
||||
{
|
||||
PASTEMAC(s,copycjs)( conja, *alpha11, alpha11_conj );
|
||||
PASTEMAC(s,invscals)( alpha11_conj, *chi11 );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0_CZ( trsv_unf_var1 )
|
||||
#else
|
||||
INSERT_GENTFUNC_BASIC0( trsv_unf_var1 )
|
||||
#endif
|
||||
|
||||
638
frame/2/trsv/bli_trsv_unf_var1_amd.c
Normal file
638
frame/2/trsv/bli_trsv_unf_var1_amd.c
Normal file
@@ -0,0 +1,638 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
uplo_t uploa, \
|
||||
trans_t transa, \
|
||||
diag_t diaga, \
|
||||
dim_t m, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* x, inc_t incx, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
if(cntx == NULL) cntx = bli_gks_query_cntx(); \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
ctype* one = PASTEMAC(ch,1); \
|
||||
ctype* minus_one = PASTEMAC(ch,m1); \
|
||||
ctype* A10; \
|
||||
ctype* A11; \
|
||||
ctype* A12; \
|
||||
ctype* a10t; \
|
||||
ctype* alpha11; \
|
||||
ctype* a12t; \
|
||||
ctype* x0; \
|
||||
ctype* x1; \
|
||||
ctype* x2; \
|
||||
ctype* x01; \
|
||||
ctype* chi11; \
|
||||
ctype* x21; \
|
||||
ctype alpha11_conj; \
|
||||
ctype rho1; \
|
||||
dim_t iter, i, k, j, l; \
|
||||
dim_t b_fuse, f; \
|
||||
dim_t n_behind, f_behind; \
|
||||
inc_t rs_at, cs_at; \
|
||||
uplo_t uploa_trans; \
|
||||
conj_t conja; \
|
||||
\
|
||||
/* x = alpha * x; */ \
|
||||
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
m, \
|
||||
alpha, \
|
||||
x, incx, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
\
|
||||
if ( bli_does_notrans( transa ) ) \
|
||||
{ \
|
||||
rs_at = rs_a; \
|
||||
cs_at = cs_a; \
|
||||
uploa_trans = uploa; \
|
||||
} \
|
||||
else /* if ( bli_does_trans( transa ) ) */ \
|
||||
{ \
|
||||
rs_at = cs_a; \
|
||||
cs_at = rs_a; \
|
||||
uploa_trans = bli_uplo_toggled( uploa ); \
|
||||
} \
|
||||
\
|
||||
conja = bli_extract_conj( transa ); \
|
||||
\
|
||||
PASTECH(ch,dotxf_ker_ft) kfp_df; \
|
||||
\
|
||||
/* Query the context for the kernel function pointer and fusing factor. */ \
|
||||
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
|
||||
\
|
||||
/* We reduce all of the possible cases down to just lower/upper. */ \
|
||||
if ( bli_is_upper( uploa_trans ) ) \
|
||||
{ \
|
||||
for ( iter = 0; iter < m; iter += f ) \
|
||||
{ \
|
||||
f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
|
||||
i = m - iter - f; \
|
||||
n_behind = iter; \
|
||||
A11 = a + (i )*rs_at + (i )*cs_at; \
|
||||
A12 = a + (i )*rs_at + (i+f)*cs_at; \
|
||||
x1 = x + (i )*incx; \
|
||||
x2 = x + (i+f)*incx; \
|
||||
\
|
||||
/* x1 = x1 - A12 * x2; */ \
|
||||
kfp_df \
|
||||
( \
|
||||
conja, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n_behind, \
|
||||
f, \
|
||||
minus_one, \
|
||||
A12, cs_at, rs_at, \
|
||||
x2, incx, \
|
||||
one, \
|
||||
x1, incx, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* x1 = x1 / triu( A11 ); */ \
|
||||
for ( k = 0; k < f; ++k ) \
|
||||
{ \
|
||||
l = f - k - 1; \
|
||||
f_behind = k; \
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at; \
|
||||
a12t = A11 + (l )*rs_at + (l+1)*cs_at; \
|
||||
chi11 = x1 + (l )*incx; \
|
||||
x21 = x1 + (l+1)*incx; \
|
||||
\
|
||||
/* chi11 = chi11 - a12t * x21; */ \
|
||||
PASTEMAC(ch,set0s)( rho1 ); \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( j = 0; j < f_behind; ++j ) \
|
||||
PASTEMAC(ch,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( j = 0; j < f_behind; ++j ) \
|
||||
PASTEMAC(ch,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
|
||||
} \
|
||||
PASTEMAC(ch,subs)( rho1, *chi11 ); \
|
||||
\
|
||||
/* chi11 = chi11 / alpha11; */ \
|
||||
if ( bli_is_nonunit_diag( diaga ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
|
||||
PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_lower( uploa_trans ) ) */ \
|
||||
{ \
|
||||
for ( iter = 0; iter < m; iter += f ) \
|
||||
{ \
|
||||
f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
|
||||
i = iter; \
|
||||
n_behind = i; \
|
||||
A11 = a + (i )*rs_at + (i )*cs_at; \
|
||||
A10 = a + (i )*rs_at + (0 )*cs_at; \
|
||||
x1 = x + (i )*incx; \
|
||||
x0 = x + (0 )*incx; \
|
||||
\
|
||||
/* x1 = x1 - A10 * x0; */ \
|
||||
kfp_df \
|
||||
( \
|
||||
conja, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n_behind, \
|
||||
f, \
|
||||
minus_one, \
|
||||
A10, cs_at, rs_at, \
|
||||
x0, incx, \
|
||||
one, \
|
||||
x1, incx, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* x1 = x1 / tril( A11 ); */ \
|
||||
for ( k = 0; k < f; ++k ) \
|
||||
{ \
|
||||
l = k; \
|
||||
f_behind = l; \
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at; \
|
||||
a10t = A11 + (l )*rs_at + (0 )*cs_at; \
|
||||
chi11 = x1 + (l )*incx; \
|
||||
x01 = x1 + (0 )*incx; \
|
||||
\
|
||||
/* chi11 = chi11 - a10t * x01; */ \
|
||||
PASTEMAC(ch,set0s)( rho1 ); \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( j = 0; j < f_behind; ++j ) \
|
||||
PASTEMAC(ch,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( j = 0; j < f_behind; ++j ) \
|
||||
PASTEMAC(ch,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
|
||||
} \
|
||||
PASTEMAC(ch,subs)( rho1, *chi11 ); \
|
||||
\
|
||||
/* chi11 = chi11 / alpha11; */ \
|
||||
if ( bli_is_nonunit_diag( diaga ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
|
||||
PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
void bli_dtrsv_unf_var1
|
||||
(
|
||||
uplo_t uploa,
|
||||
trans_t transa,
|
||||
diag_t diaga,
|
||||
dim_t m,
|
||||
double* alpha,
|
||||
double* a, inc_t rs_a, inc_t cs_a,
|
||||
double* x, inc_t incx,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
double* one = PASTEMAC(d,1);
|
||||
double* minus_one = PASTEMAC(d,m1);
|
||||
double* A10;
|
||||
double* A11;
|
||||
double* A12;
|
||||
double* a10t;
|
||||
double* alpha11;
|
||||
double* a12t;
|
||||
double* x0;
|
||||
double* x1;
|
||||
double* x2;
|
||||
double* x01;
|
||||
double* chi11;
|
||||
double* x21;
|
||||
double alpha11_conj;
|
||||
double rho1;
|
||||
dim_t iter, i, k, j, l;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_behind, f_behind;
|
||||
inc_t rs_at, cs_at;
|
||||
uplo_t uploa_trans;
|
||||
conj_t conja;
|
||||
|
||||
/* x = alpha * x; */
|
||||
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
m,
|
||||
alpha,
|
||||
x, incx,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
if( bli_does_notrans( transa ) )
|
||||
{
|
||||
rs_at = rs_a;
|
||||
cs_at = cs_a;
|
||||
uploa_trans = uploa;
|
||||
}
|
||||
else /* if ( bli_does_trans( transa ) ) */
|
||||
{
|
||||
rs_at = cs_a;
|
||||
cs_at = rs_a;
|
||||
uploa_trans = bli_uplo_toggled( uploa );
|
||||
}
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
PASTECH(d,dotxf_ker_ft) kfp_df;
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE) {
|
||||
kfp_df = bli_ddotxf_zen_int_8;
|
||||
b_fuse = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
num_t dt = PASTEMAC(d,type);
|
||||
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
|
||||
}
|
||||
|
||||
/* We reduce all of the possible cases down to just lower/upper. */
|
||||
if ( bli_is_upper( uploa_trans ) )
|
||||
{
|
||||
for ( iter = 0; iter < m; iter += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
|
||||
i = m - iter - f;
|
||||
n_behind = iter;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A12 = a + (i )*rs_at + (i+f)*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x2 = x + (i+f)*incx;
|
||||
|
||||
/* x1 = x1 - A12 * x2; */
|
||||
kfp_df
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_behind,
|
||||
f,
|
||||
minus_one,
|
||||
A12, cs_at, rs_at,
|
||||
x2, incx,
|
||||
one,
|
||||
x1, incx,
|
||||
cntx
|
||||
);
|
||||
|
||||
/* x1 = x1 / triu( A11 ); */
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
l = f - k - 1;
|
||||
f_behind = k;
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
|
||||
a12t = A11 + (l )*rs_at + (l+1)*cs_at;
|
||||
chi11 = x1 + (l )*incx;
|
||||
x21 = x1 + (l+1)*incx;
|
||||
|
||||
/* chi11 = chi11 - a12t * x21; */
|
||||
PASTEMAC(d,set0s)( rho1 );
|
||||
if ( bli_is_conj( conja ) )
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
PASTEMAC(d,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
PASTEMAC(d,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
|
||||
}
|
||||
PASTEMAC(d,subs)( rho1, *chi11 );
|
||||
|
||||
/* chi11 = chi11 / alpha11; */
|
||||
if ( bli_is_nonunit_diag( diaga ) )
|
||||
{
|
||||
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
|
||||
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else /* if ( bli_is_lower( uploa_trans ) ) */
|
||||
{
|
||||
for ( iter = 0; iter < m; iter += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
|
||||
i = iter;
|
||||
n_behind = i;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A10 = a + (i )*rs_at + (0 )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x0 = x + (0 )*incx;
|
||||
|
||||
/* x1 = x1 - A10 * x0; */
|
||||
kfp_df
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_behind,
|
||||
f,
|
||||
minus_one,
|
||||
A10, cs_at, rs_at,
|
||||
x0, incx,
|
||||
one,
|
||||
x1, incx,
|
||||
cntx
|
||||
);
|
||||
|
||||
/* x1 = x1 / tril( A11 ); */
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
l = k;
|
||||
f_behind = l;
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
|
||||
a10t = A11 + (l )*rs_at + (0 )*cs_at;
|
||||
chi11 = x1 + (l )*incx;
|
||||
x01 = x1 + (0 )*incx;
|
||||
|
||||
/* chi11 = chi11 - a10t * x01; */
|
||||
PASTEMAC(d,set0s)( rho1 );
|
||||
if ( bli_is_conj( conja ) )
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
PASTEMAC(d,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
PASTEMAC(d,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
|
||||
}
|
||||
PASTEMAC(d,subs)( rho1, *chi11 );
|
||||
|
||||
/* chi11 = chi11 / alpha11; */
|
||||
if ( bli_is_nonunit_diag( diaga ) )
|
||||
{
|
||||
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
|
||||
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void bli_strsv_unf_var1
|
||||
(
|
||||
uplo_t uploa,
|
||||
trans_t transa,
|
||||
diag_t diaga,
|
||||
dim_t m,
|
||||
float* alpha,
|
||||
float* a, inc_t rs_a, inc_t cs_a,
|
||||
float* x, inc_t incx,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
float* one = PASTEMAC(s,1);
|
||||
float* minus_one = PASTEMAC(s,m1);
|
||||
float* A10;
|
||||
float* A11;
|
||||
float* A12;
|
||||
float* a10t;
|
||||
float* alpha11;
|
||||
float* a12t;
|
||||
float* x0;
|
||||
float* x1;
|
||||
float* x2;
|
||||
float* x01;
|
||||
float* chi11;
|
||||
float* x21;
|
||||
float alpha11_conj;
|
||||
float rho1;
|
||||
dim_t iter, i, k, j, l;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_behind, f_behind;
|
||||
inc_t rs_at, cs_at;
|
||||
uplo_t uploa_trans;
|
||||
conj_t conja;
|
||||
|
||||
/* x = alpha * x; */
|
||||
PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
m,
|
||||
alpha,
|
||||
x, incx,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
if( bli_does_notrans( transa ) )
|
||||
{
|
||||
rs_at = rs_a;
|
||||
cs_at = cs_a;
|
||||
uploa_trans = uploa;
|
||||
}
|
||||
else /* if ( bli_does_trans( transa ) ) */
|
||||
{
|
||||
rs_at = cs_a;
|
||||
cs_at = rs_a;
|
||||
uploa_trans = bli_uplo_toggled( uploa );
|
||||
}
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
PASTECH(s,dotxf_ker_ft) kfp_df;
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE) {
|
||||
kfp_df = bli_sdotxf_zen_int_8;
|
||||
b_fuse = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
num_t dt = PASTEMAC(s,type);
|
||||
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
|
||||
|
||||
}
|
||||
|
||||
/* We reduce all of the possible cases down to just lower/upper. */
|
||||
if ( bli_is_upper( uploa_trans ) )
|
||||
{
|
||||
for ( iter = 0; iter < m; iter += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
|
||||
i = m - iter - f;
|
||||
n_behind = iter;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A12 = a + (i )*rs_at + (i+f)*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x2 = x + (i+f)*incx;
|
||||
|
||||
/* x1 = x1 - A12 * x2; */
|
||||
kfp_df
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_behind,
|
||||
f,
|
||||
minus_one,
|
||||
A12, cs_at, rs_at,
|
||||
x2, incx,
|
||||
one,
|
||||
x1, incx,
|
||||
cntx
|
||||
);
|
||||
|
||||
/* x1 = x1 / triu( A11 ); */
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
l = f - k - 1;
|
||||
f_behind = k;
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
|
||||
a12t = A11 + (l )*rs_at + (l+1)*cs_at;
|
||||
chi11 = x1 + (l )*incx;
|
||||
x21 = x1 + (l+1)*incx;
|
||||
|
||||
/* chi11 = chi11 - a12t * x21; */
|
||||
PASTEMAC(s,set0s)( rho1 );
|
||||
if ( bli_is_conj( conja ) )
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
PASTEMAC(s,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
PASTEMAC(s,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
|
||||
}
|
||||
PASTEMAC(s,subs)( rho1, *chi11 );
|
||||
|
||||
/* chi11 = chi11 / alpha11; */
|
||||
if ( bli_is_nonunit_diag( diaga ) )
|
||||
{
|
||||
PASTEMAC(s,copycjs)( conja, *alpha11, alpha11_conj );
|
||||
PASTEMAC(s,invscals)( alpha11_conj, *chi11 );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else /* if ( bli_is_lower( uploa_trans ) ) */
|
||||
{
|
||||
for ( iter = 0; iter < m; iter += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
|
||||
i = iter;
|
||||
n_behind = i;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A10 = a + (i )*rs_at + (0 )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x0 = x + (0 )*incx;
|
||||
|
||||
/* x1 = x1 - A10 * x0; */
|
||||
kfp_df
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_behind,
|
||||
f,
|
||||
minus_one,
|
||||
A10, cs_at, rs_at,
|
||||
x0, incx,
|
||||
one,
|
||||
x1, incx,
|
||||
cntx
|
||||
);
|
||||
|
||||
/* x1 = x1 / tril( A11 ); */
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
l = k;
|
||||
f_behind = l;
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
|
||||
a10t = A11 + (l )*rs_at + (0 )*cs_at;
|
||||
chi11 = x1 + (l )*incx;
|
||||
x01 = x1 + (0 )*incx;
|
||||
|
||||
/* chi11 = chi11 - a10t * x01; */
|
||||
PASTEMAC(s,set0s)( rho1 );
|
||||
if ( bli_is_conj( conja ) )
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
PASTEMAC(s,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < f_behind; ++j )
|
||||
PASTEMAC(s,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
|
||||
}
|
||||
PASTEMAC(s,subs)( rho1, *chi11 );
|
||||
|
||||
/* chi11 = chi11 / alpha11; */
|
||||
if ( bli_is_nonunit_diag( diaga ) )
|
||||
{
|
||||
PASTEMAC(s,copycjs)( conja, *alpha11, alpha11_conj );
|
||||
PASTEMAC(s,invscals)( alpha11_conj, *chi11 );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0_CZ( trsv_unf_var1 )
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -228,805 +228,5 @@ void PASTEMAC(ch,varname) \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
void bli_dtrsv_unf_var2
|
||||
(
|
||||
uplo_t uploa,
|
||||
trans_t transa,
|
||||
diag_t diaga,
|
||||
dim_t m,
|
||||
double* alpha,
|
||||
double* a, inc_t rs_a, inc_t cs_a,
|
||||
double* x, inc_t incx,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
double* minus_one = PASTEMAC(d,m1);
|
||||
double* A01;
|
||||
double* A11;
|
||||
double* A21;
|
||||
double* a01;
|
||||
double* alpha11;
|
||||
double* a21;
|
||||
double* x0;
|
||||
double* x1;
|
||||
double* x2;
|
||||
double* x01;
|
||||
double* chi11;
|
||||
double* x21;
|
||||
double alpha11_conj;
|
||||
double minus_chi11;
|
||||
dim_t iter, i, k, j, l;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_ahead, f_ahead;
|
||||
inc_t rs_at, cs_at;
|
||||
uplo_t uploa_trans;
|
||||
conj_t conja;
|
||||
|
||||
/* x = alpha * x; */
|
||||
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
m,
|
||||
alpha,
|
||||
x, incx,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
if ( bli_does_notrans( transa ) )
|
||||
{
|
||||
rs_at = rs_a;
|
||||
cs_at = cs_a;
|
||||
uploa_trans = uploa;
|
||||
}
|
||||
else /* if ( bli_does_trans( transa ) ) */
|
||||
{
|
||||
rs_at = cs_a;
|
||||
cs_at = rs_a;
|
||||
uploa_trans = bli_uplo_toggled( uploa );
|
||||
}
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
PASTECH(d,axpyf_ker_ft) kfp_af;
|
||||
|
||||
/* Assign kernel function pointer and fusing factor. */
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen) {
|
||||
kfp_af = bli_daxpyf_zen_int_16x4;
|
||||
b_fuse = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx );
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_AF, cntx );
|
||||
}
|
||||
|
||||
/* We reduce all of the possible cases down to just lower/upper. */
|
||||
if ( bli_is_upper( uploa_trans ) )
|
||||
{
|
||||
for ( iter = 0; iter < m; iter += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
|
||||
i = m - iter - f;
|
||||
n_ahead = i;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A01 = a + (0 )*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x0 = x + (0 )*incx;
|
||||
|
||||
/* x1 = x1 / triu( A11 ); */
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
l = f - k - 1;
|
||||
f_ahead = l;
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
|
||||
a01 = A11 + (0 )*rs_at + (l )*cs_at;
|
||||
chi11 = x1 + (l )*incx;
|
||||
x01 = x1 + (0 )*incx;
|
||||
|
||||
/* chi11 = chi11 / alpha11; */
|
||||
if ( bli_is_nonunit_diag( diaga ) )
|
||||
{
|
||||
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
|
||||
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
|
||||
}
|
||||
|
||||
/* x01 = x01 - chi11 * a01; */
|
||||
PASTEMAC(d,neg2s)( *chi11, minus_chi11 );
|
||||
if ( bli_is_conj( conja ) )
|
||||
{
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
PASTEMAC(d,axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
PASTEMAC(d,axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
|
||||
}
|
||||
}
|
||||
|
||||
/* x0 = x0 - A01 * x1; */
|
||||
kfp_af
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_ahead,
|
||||
f,
|
||||
minus_one,
|
||||
A01, rs_at, cs_at,
|
||||
x1, incx,
|
||||
x0, incx,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
else /* if ( bli_is_lower( uploa_trans ) ) */
|
||||
{
|
||||
for ( iter = 0; iter < m; iter += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
|
||||
i = iter;
|
||||
n_ahead = m - iter - f;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A21 = a + (i+f)*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x2 = x + (i+f)*incx;
|
||||
|
||||
/* x1 = x1 / tril( A11 ); */
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
l = k;
|
||||
f_ahead = f - k - 1;
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
|
||||
a21 = A11 + (l+1)*rs_at + (l )*cs_at;
|
||||
chi11 = x1 + (l )*incx;
|
||||
x21 = x1 + (l+1)*incx;
|
||||
|
||||
/* chi11 = chi11 / alpha11; */
|
||||
if ( bli_is_nonunit_diag( diaga ) )
|
||||
{
|
||||
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
|
||||
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
|
||||
}
|
||||
|
||||
/* x21 = x21 - chi11 * a21; */
|
||||
PASTEMAC(d,neg2s)( *chi11, minus_chi11 );
|
||||
if ( bli_is_conj( conja ) )
|
||||
{
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
PASTEMAC(d,axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
PASTEMAC(d,axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
|
||||
}
|
||||
}
|
||||
|
||||
/* x2 = x2 - A21 * x1; */
|
||||
kfp_af
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_ahead,
|
||||
f,
|
||||
minus_one,
|
||||
A21, rs_at, cs_at,
|
||||
x1, incx,
|
||||
x2, incx,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void bli_strsv_unf_var2
|
||||
(
|
||||
uplo_t uploa,
|
||||
trans_t transa,
|
||||
diag_t diaga,
|
||||
dim_t m,
|
||||
float* alpha,
|
||||
float* a, inc_t rs_a, inc_t cs_a,
|
||||
float* x, inc_t incx,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
float* minus_one = PASTEMAC(s, m1);
|
||||
float* A01;
|
||||
float* A11;
|
||||
float* A21;
|
||||
float* a01;
|
||||
float* alpha11;
|
||||
float* a21;
|
||||
float* x0;
|
||||
float* x1;
|
||||
float* x2;
|
||||
float* x01;
|
||||
float* chi11;
|
||||
float* x21;
|
||||
float alpha11_conj;
|
||||
float minus_chi11;
|
||||
dim_t iter, i, k, j, l;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_ahead, f_ahead;
|
||||
inc_t rs_at, cs_at;
|
||||
uplo_t uploa_trans;
|
||||
conj_t conja;
|
||||
|
||||
/* x = alpha * x; */
|
||||
PASTEMAC2(s, scalv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
m,
|
||||
alpha,
|
||||
x, incx,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
if( bli_does_notrans( transa ) )
|
||||
{
|
||||
rs_at = rs_a;
|
||||
cs_at = cs_a;
|
||||
uploa_trans = uploa;
|
||||
}
|
||||
else /* if ( bli_does_trans( transa ) ) */
|
||||
{
|
||||
rs_at = cs_a;
|
||||
cs_at = rs_a;
|
||||
uploa_trans = bli_uplo_toggled( uploa );
|
||||
}
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
PASTECH(s, axpyf_ker_ft) kfp_af;
|
||||
|
||||
/* Assign function pointer and fusing factor. */
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen) {
|
||||
kfp_af = bli_saxpyf_zen_int_5;
|
||||
b_fuse = 5;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_FLOAT, BLIS_AXPYF_KER, cntx );
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( BLIS_FLOAT, BLIS_AF, cntx );
|
||||
}
|
||||
|
||||
/* We reduce all of the possible cases down to just lower/upper. */
|
||||
if ( bli_is_upper( uploa_trans ) )
|
||||
{
|
||||
for ( iter = 0; iter < m; iter += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
|
||||
i = m - iter - f;
|
||||
n_ahead = i;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A01 = a + (0 )*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x0 = x + (0 )*incx;
|
||||
|
||||
/* x1 = x1 / triu( A11 ); */
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
l = f - k - 1;
|
||||
f_ahead = l;
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
|
||||
a01 = A11 + (0 )*rs_at + (l )*cs_at;
|
||||
chi11 = x1 + (l )*incx;
|
||||
x01 = x1 + (0 )*incx;
|
||||
|
||||
/* chi11 = chi11 / alpha11; */
|
||||
if ( bli_is_nonunit_diag( diaga ) )
|
||||
{
|
||||
PASTEMAC(s, copycjs)( conja, *alpha11, alpha11_conj );
|
||||
PASTEMAC(s, invscals)( alpha11_conj, *chi11 );
|
||||
}
|
||||
|
||||
/* x01 = x01 - chi11 * a01; */
|
||||
PASTEMAC(s, neg2s)( *chi11, minus_chi11 );
|
||||
if ( bli_is_conj( conja ) )
|
||||
{
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
PASTEMAC(s, axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
PASTEMAC(s, axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
|
||||
}
|
||||
}
|
||||
|
||||
/* x0 = x0 - A01 * x1; */
|
||||
kfp_af
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_ahead,
|
||||
f,
|
||||
minus_one,
|
||||
A01, rs_at, cs_at,
|
||||
x1, incx,
|
||||
x0, incx,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
else /* if ( bli_is_lower( uploa_trans ) ) */
|
||||
{
|
||||
for ( iter = 0; iter < m; iter += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
|
||||
i = iter;
|
||||
n_ahead = m - iter - f;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A21 = a + (i+f)*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x2 = x + (i+f)*incx;
|
||||
|
||||
/* x1 = x1 / tril( A11 ); */
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
l = k;
|
||||
f_ahead = f - k - 1;
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
|
||||
a21 = A11 + (l+1)*rs_at + (l )*cs_at;
|
||||
chi11 = x1 + (l )*incx;
|
||||
x21 = x1 + (l+1)*incx;
|
||||
|
||||
/* chi11 = chi11 / alpha11; */
|
||||
if ( bli_is_nonunit_diag( diaga ) )
|
||||
{
|
||||
PASTEMAC(s, copycjs)( conja, *alpha11, alpha11_conj );
|
||||
PASTEMAC(s, invscals)( alpha11_conj, *chi11 );
|
||||
}
|
||||
|
||||
/* x21 = x21 - chi11 * a21; */
|
||||
PASTEMAC(s, neg2s)( *chi11, minus_chi11 );
|
||||
if ( bli_is_conj( conja ) )
|
||||
{
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
PASTEMAC(s, axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
PASTEMAC(s, axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
|
||||
}
|
||||
}
|
||||
|
||||
/* x2 = x2 - A21 * x1; */
|
||||
kfp_af
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_ahead,
|
||||
f,
|
||||
minus_one,
|
||||
A21, rs_at, cs_at,
|
||||
x1, incx,
|
||||
x2, incx,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void bli_ztrsv_unf_var2
|
||||
(
|
||||
uplo_t uploa,
|
||||
trans_t transa,
|
||||
diag_t diaga,
|
||||
dim_t m,
|
||||
dcomplex* alpha,
|
||||
dcomplex* a, inc_t rs_a, inc_t cs_a,
|
||||
dcomplex* x, inc_t incx,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
dcomplex* minus_one = PASTEMAC(z, m1);
|
||||
dcomplex* A01;
|
||||
dcomplex* A11;
|
||||
dcomplex* A21;
|
||||
dcomplex* a01;
|
||||
dcomplex* alpha11;
|
||||
dcomplex* a21;
|
||||
dcomplex* x0;
|
||||
dcomplex* x1;
|
||||
dcomplex* x2;
|
||||
dcomplex* x01;
|
||||
dcomplex* chi11;
|
||||
dcomplex* x21;
|
||||
dcomplex alpha11_conj;
|
||||
dcomplex minus_chi11;
|
||||
dim_t iter, i, k, j, l;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_ahead, f_ahead;
|
||||
inc_t rs_at, cs_at;
|
||||
uplo_t uploa_trans;
|
||||
conj_t conja;
|
||||
|
||||
/* x = alpha * x; */
|
||||
PASTEMAC2(z, scalv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
m,
|
||||
alpha,
|
||||
x, incx,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
if( bli_does_notrans( transa ) )
|
||||
{
|
||||
rs_at = rs_a;
|
||||
cs_at = cs_a;
|
||||
uploa_trans = uploa;
|
||||
}
|
||||
else /* if ( bli_does_trans( transa ) ) */
|
||||
{
|
||||
rs_at = cs_a;
|
||||
cs_at = rs_a;
|
||||
uploa_trans = bli_uplo_toggled( uploa );
|
||||
}
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
PASTECH(z, axpyf_ker_ft) kfp_af;
|
||||
|
||||
/* Assign function pointer and fusing factor. */
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen) {
|
||||
kfp_af = bli_zaxpyf_zen_int_5;
|
||||
b_fuse = 5;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_DCOMPLEX, BLIS_AXPYF_KER, cntx );
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_AF, cntx );
|
||||
}
|
||||
/* We reduce all of the possible cases down to just lower/upper. */
|
||||
if ( bli_is_upper( uploa_trans ) )
|
||||
{
|
||||
for ( iter = 0; iter < m; iter += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
|
||||
i = m - iter - f;
|
||||
n_ahead = i;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A01 = a + (0 )*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x0 = x + (0 )*incx;
|
||||
|
||||
/* x1 = x1 / triu( A11 ); */
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
l = f - k - 1;
|
||||
f_ahead = l;
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
|
||||
a01 = A11 + (0 )*rs_at + (l )*cs_at;
|
||||
chi11 = x1 + (l )*incx;
|
||||
x01 = x1 + (0 )*incx;
|
||||
|
||||
/* chi11 = chi11 / alpha11; */
|
||||
if ( bli_is_nonunit_diag( diaga ) )
|
||||
{
|
||||
PASTEMAC(z, copycjs)( conja, *alpha11, alpha11_conj );
|
||||
PASTEMAC(z, invscals)( alpha11_conj, *chi11 );
|
||||
}
|
||||
|
||||
/* x01 = x01 - chi11 * a01; */
|
||||
PASTEMAC(z, neg2s)( *chi11, minus_chi11 );
|
||||
if ( bli_is_conj( conja ) )
|
||||
{
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
PASTEMAC(z, axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
PASTEMAC(z, axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
|
||||
}
|
||||
}
|
||||
|
||||
/* x0 = x0 - A01 * x1; */
|
||||
kfp_af
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_ahead,
|
||||
f,
|
||||
minus_one,
|
||||
A01, rs_at, cs_at,
|
||||
x1, incx,
|
||||
x0, incx,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
else /* if ( bli_is_lower( uploa_trans ) ) */
|
||||
{
|
||||
for ( iter = 0; iter < m; iter += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
|
||||
i = iter;
|
||||
n_ahead = m - iter - f;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A21 = a + (i+f)*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x2 = x + (i+f)*incx;
|
||||
|
||||
/* x1 = x1 / tril( A11 ); */
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
l = k;
|
||||
f_ahead = f - k - 1;
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
|
||||
a21 = A11 + (l+1)*rs_at + (l )*cs_at;
|
||||
chi11 = x1 + (l )*incx;
|
||||
x21 = x1 + (l+1)*incx;
|
||||
|
||||
/* chi11 = chi11 / alpha11; */
|
||||
if ( bli_is_nonunit_diag( diaga ) )
|
||||
{
|
||||
PASTEMAC(z, copycjs)( conja, *alpha11, alpha11_conj );
|
||||
PASTEMAC(z, invscals)( alpha11_conj, *chi11 );
|
||||
}
|
||||
|
||||
/* x21 = x21 - chi11 * a21; */
|
||||
PASTEMAC(z, neg2s)( *chi11, minus_chi11 );
|
||||
if ( bli_is_conj( conja ) )
|
||||
{
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
PASTEMAC(z, axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
PASTEMAC(z, axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
|
||||
}
|
||||
}
|
||||
|
||||
/* x2 = x2 - A21 * x1; */
|
||||
kfp_af
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_ahead,
|
||||
f,
|
||||
minus_one,
|
||||
A21, rs_at, cs_at,
|
||||
x1, incx,
|
||||
x2, incx,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void bli_ctrsv_unf_var2
|
||||
(
|
||||
uplo_t uploa,
|
||||
trans_t transa,
|
||||
diag_t diaga,
|
||||
dim_t m,
|
||||
scomplex* alpha,
|
||||
scomplex* a, inc_t rs_a, inc_t cs_a,
|
||||
scomplex* x, inc_t incx,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
scomplex* minus_one = PASTEMAC(c, m1);
|
||||
scomplex* A01;
|
||||
scomplex* A11;
|
||||
scomplex* A21;
|
||||
scomplex* a01;
|
||||
scomplex* alpha11;
|
||||
scomplex* a21;
|
||||
scomplex* x0;
|
||||
scomplex* x1;
|
||||
scomplex* x2;
|
||||
scomplex* x01;
|
||||
scomplex* chi11;
|
||||
scomplex* x21;
|
||||
scomplex alpha11_conj;
|
||||
scomplex minus_chi11;
|
||||
dim_t iter, i, k, j, l;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_ahead, f_ahead;
|
||||
inc_t rs_at, cs_at;
|
||||
uplo_t uploa_trans;
|
||||
conj_t conja;
|
||||
|
||||
/* x = alpha * x; */
|
||||
PASTEMAC2(c, scalv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
m,
|
||||
alpha,
|
||||
x, incx,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
if( bli_does_notrans( transa ) )
|
||||
{
|
||||
rs_at = rs_a;
|
||||
cs_at = cs_a;
|
||||
uploa_trans = uploa;
|
||||
}
|
||||
else /* if ( bli_does_trans( transa ) ) */
|
||||
{
|
||||
rs_at = cs_a;
|
||||
cs_at = rs_a;
|
||||
uploa_trans = bli_uplo_toggled( uploa );
|
||||
}
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
PASTECH(c, axpyf_ker_ft) kfp_af;
|
||||
|
||||
/* Assign function pointer and fusing factor. */
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen) {
|
||||
kfp_af = bli_caxpyf_zen_int_5;
|
||||
b_fuse = 5;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYF_KER, cntx );
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_AF, cntx );
|
||||
}
|
||||
/* We reduce all of the possible cases down to just lower/upper. */
|
||||
if ( bli_is_upper( uploa_trans ) )
|
||||
{
|
||||
for ( iter = 0; iter < m; iter += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
|
||||
i = m - iter - f;
|
||||
n_ahead = i;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A01 = a + (0 )*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x0 = x + (0 )*incx;
|
||||
|
||||
/* x1 = x1 / triu( A11 ); */
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
l = f - k - 1;
|
||||
f_ahead = l;
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
|
||||
a01 = A11 + (0 )*rs_at + (l )*cs_at;
|
||||
chi11 = x1 + (l )*incx;
|
||||
x01 = x1 + (0 )*incx;
|
||||
|
||||
/* chi11 = chi11 / alpha11; */
|
||||
if ( bli_is_nonunit_diag( diaga ) )
|
||||
{
|
||||
PASTEMAC(c, copycjs)( conja, *alpha11, alpha11_conj );
|
||||
PASTEMAC(c, invscals)( alpha11_conj, *chi11 );
|
||||
}
|
||||
|
||||
/* x01 = x01 - chi11 * a01; */
|
||||
PASTEMAC(c, neg2s)( *chi11, minus_chi11 );
|
||||
if ( bli_is_conj( conja ) )
|
||||
{
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
PASTEMAC(c, axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
PASTEMAC(c, axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
|
||||
}
|
||||
}
|
||||
|
||||
/* x0 = x0 - A01 * x1; */
|
||||
kfp_af
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_ahead,
|
||||
f,
|
||||
minus_one,
|
||||
A01, rs_at, cs_at,
|
||||
x1, incx,
|
||||
x0, incx,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
else /* if ( bli_is_lower( uploa_trans ) ) */
|
||||
{
|
||||
for ( iter = 0; iter < m; iter += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
|
||||
i = iter;
|
||||
n_ahead = m - iter - f;
|
||||
A11 = a + (i )*rs_at + (i )*cs_at;
|
||||
A21 = a + (i+f)*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
x2 = x + (i+f)*incx;
|
||||
|
||||
/* x1 = x1 / tril( A11 ); */
|
||||
for ( k = 0; k < f; ++k )
|
||||
{
|
||||
l = k;
|
||||
f_ahead = f - k - 1;
|
||||
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
|
||||
a21 = A11 + (l+1)*rs_at + (l )*cs_at;
|
||||
chi11 = x1 + (l )*incx;
|
||||
x21 = x1 + (l+1)*incx;
|
||||
|
||||
/* chi11 = chi11 / alpha11; */
|
||||
if ( bli_is_nonunit_diag( diaga ) )
|
||||
{
|
||||
PASTEMAC(c, copycjs)( conja, *alpha11, alpha11_conj );
|
||||
PASTEMAC(c, invscals)( alpha11_conj, *chi11 );
|
||||
}
|
||||
|
||||
/* x21 = x21 - chi11 * a21; */
|
||||
PASTEMAC(c, neg2s)( *chi11, minus_chi11 );
|
||||
if ( bli_is_conj( conja ) )
|
||||
{
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
PASTEMAC(c, axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( j = 0; j < f_ahead; ++j )
|
||||
PASTEMAC(c, axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
|
||||
}
|
||||
}
|
||||
|
||||
/* x2 = x2 - A21 * x1; */
|
||||
kfp_af
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_ahead,
|
||||
f,
|
||||
minus_one,
|
||||
A21, rs_at, cs_at,
|
||||
x1, incx,
|
||||
x2, incx,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
INSERT_GENTFUNC_BASIC0( trsv_unf_var2 )
|
||||
#endif
|
||||
INSERT_GENTFUNC_BASIC0( trsv_unf_var2 )
|
||||
1024
frame/2/trsv/bli_trsv_unf_var2_amd.c
Normal file
1024
frame/2/trsv/bli_trsv_unf_var2_amd.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -48,120 +48,6 @@ err_t bli_gemmsup_int
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4);
|
||||
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
const dim_t k = bli_obj_width( a );
|
||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
|
||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
|
||||
const bool auto_factor = bli_rntm_auto_factor( rntm );
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
|
||||
dim_t jc_new;
|
||||
dim_t ic_new;
|
||||
|
||||
|
||||
//bli_gemmsup_ref_var2
|
||||
//bli_gemmsup_ref_var1
|
||||
#if 0
|
||||
bli_gemmsup_ref_var1n
|
||||
#else
|
||||
#endif
|
||||
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
||||
const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
||||
stor_id == BLIS_RRC ||
|
||||
stor_id == BLIS_RCR ||
|
||||
stor_id == BLIS_CRR );
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var2m primary\n" );
|
||||
#endif
|
||||
|
||||
// Don't use the small/unpacked implementation if one of the matrices
|
||||
// uses general stride.
|
||||
if ( stor_id == BLIS_XXX ) {
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_4, "SUP doesn't support general stide.");
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
if ( is_rrr_rrc_rcr_crr )
|
||||
{
|
||||
// This branch handles:
|
||||
// - rrr rrc rcr crr for row-preferential kernels
|
||||
// - rcc crc ccr ccc for column-preferential kernels
|
||||
// - Currently only row-preferential kernels are only supported.
|
||||
|
||||
// calculate number of micropanels in m and n dimensions and
|
||||
// recalculate the automatic thread factorization based on these number of micropanels
|
||||
const dim_t mu = m / MR;
|
||||
const dim_t nu = n / NR;
|
||||
|
||||
// If the parallel thread factorization was automatic, we update it
|
||||
// with a new factorization based on the matrix dimensions in units
|
||||
// of micropanels.
|
||||
if ( auto_factor )
|
||||
{
|
||||
// In the block-panel algorithm, the m dimension is parallelized
|
||||
// with ic_nt and the n dimension is parallelized with jc_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
|
||||
|
||||
// Update the ways of parallelism for the jc and ic loops, and then
|
||||
// update the current thread's root thrinfo_t node according to the
|
||||
// new ways of parallelism value for the jc loop.
|
||||
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
|
||||
bli_l3_sup_thrinfo_update_root( rntm, thread );
|
||||
}
|
||||
|
||||
/*Enable packing for B matrix for higher sizes*/
|
||||
if(bli_is_float(dt) && (n_threads==1)) {
|
||||
if((m > 240) && (k > 240) && (n > 240))
|
||||
bli_rntm_set_pack_b( 1, rntm );
|
||||
}
|
||||
|
||||
bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, thread );
|
||||
}
|
||||
else
|
||||
{
|
||||
// This branch handles:
|
||||
// - rrr rrc rcr crr for column-preferential kernels
|
||||
// - rcc crc ccr ccc for row-preferential kernels
|
||||
// - Currently only row-preferential kernels are only supported.
|
||||
const dim_t mu = n / MR; // the n becomes m after a transposition
|
||||
const dim_t nu = m / NR; // the m becomes n after a transposition
|
||||
|
||||
if ( auto_factor )
|
||||
{
|
||||
// In the block-panel algorithm, the m dimension is parallelized
|
||||
// with ic_nt and the n dimension is parallelized with jc_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
|
||||
|
||||
// Update the ways of parallelism for the jc and ic loops, and then
|
||||
// update the current thread's root thrinfo_t node according to the
|
||||
// new ways of parallelism value for the jc loop.
|
||||
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
|
||||
bli_l3_sup_thrinfo_update_root( rntm, thread );
|
||||
}
|
||||
|
||||
/* Enable packing for B matrix for higher sizes. Note that pack A
|
||||
* becomes pack B inside var2m because this is transpose case*/
|
||||
if(bli_is_float(dt) && (n_threads==1)) {
|
||||
if((m > 240) && (k > 240) && (n > 240))
|
||||
bli_rntm_set_pack_a( 1, rntm );
|
||||
}
|
||||
|
||||
bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, thread );
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
|
||||
return BLIS_SUCCESS;
|
||||
|
||||
#else // #ifdef BLIS_CONFIG_EPYC
|
||||
|
||||
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
||||
|
||||
// Don't use the small/unpacked implementation if one of the matrices
|
||||
@@ -335,8 +221,6 @@ err_t bli_gemmsup_int
|
||||
// Return success so that the caller knows that we computed the solution.
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
|
||||
return BLIS_SUCCESS;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
@@ -401,15 +285,9 @@ err_t bli_gemmtsup_int
|
||||
// Decide which algorithm to use (block-panel var2m or panel-block
|
||||
// var1n) based on the number of micropanels in the m and n dimensions.
|
||||
// Also, recalculate the automatic thread factorization.
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
if ( mu >= nu ) use_bp = TRUE;
|
||||
else /* if ( mu < nu ) */ use_bp = TRUE;// var1n is not implemented for GEMMT
|
||||
|
||||
#else
|
||||
if ( mu >= nu ) use_bp = TRUE;
|
||||
else /* if ( mu < nu ) */ use_bp = FALSE;
|
||||
|
||||
#endif
|
||||
// If the parallel thread factorization was automatic, we update it
|
||||
// with a new factorization based on the matrix dimensions in units
|
||||
// of micropanels.
|
||||
@@ -472,14 +350,10 @@ err_t bli_gemmtsup_int
|
||||
// Decide which algorithm to use (block-panel var2m or panel-block
|
||||
// var1n) based on the number of micropanels in the m and n dimensions.
|
||||
// Also, recalculate the automatic thread factorization.
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
if ( mu >= nu ) use_bp = TRUE;
|
||||
else /* if ( mu < nu ) */ use_bp = TRUE; //var1n is not implemented for gemmt
|
||||
#else
|
||||
|
||||
if ( mu >= nu ) use_bp = TRUE;
|
||||
else /* if ( mu < nu ) */ use_bp = FALSE;
|
||||
|
||||
#endif
|
||||
// If the parallel thread factorization was automatic, we update it
|
||||
// with a new factorization based on the matrix dimensions in units
|
||||
// of micropanels.
|
||||
|
||||
352
frame/3/bli_l3_sup_int_amd.c
Normal file
352
frame/3/bli_l3_sup_int_amd.c
Normal file
@@ -0,0 +1,352 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019-21, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
err_t bli_gemmsup_int
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4);
|
||||
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
const dim_t k = bli_obj_width( a );
|
||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
|
||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
|
||||
const bool auto_factor = bli_rntm_auto_factor( rntm );
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
|
||||
dim_t jc_new;
|
||||
dim_t ic_new;
|
||||
|
||||
|
||||
//bli_gemmsup_ref_var2
|
||||
//bli_gemmsup_ref_var1
|
||||
#if 0
|
||||
bli_gemmsup_ref_var1n
|
||||
#else
|
||||
#endif
|
||||
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
||||
const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
||||
stor_id == BLIS_RRC ||
|
||||
stor_id == BLIS_RCR ||
|
||||
stor_id == BLIS_CRR );
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var2m primary\n" );
|
||||
#endif
|
||||
|
||||
// Don't use the small/unpacked implementation if one of the matrices
|
||||
// uses general stride.
|
||||
if ( stor_id == BLIS_XXX ) {
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_4, "SUP doesn't support general stide.");
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
if ( is_rrr_rrc_rcr_crr )
|
||||
{
|
||||
// This branch handles:
|
||||
// - rrr rrc rcr crr for row-preferential kernels
|
||||
// - rcc crc ccr ccc for column-preferential kernels
|
||||
// - Currently only row-preferential kernels are only supported.
|
||||
|
||||
// calculate number of micropanels in m and n dimensions and
|
||||
// recalculate the automatic thread factorization based on these number of micropanels
|
||||
const dim_t mu = m / MR;
|
||||
const dim_t nu = n / NR;
|
||||
|
||||
// If the parallel thread factorization was automatic, we update it
|
||||
// with a new factorization based on the matrix dimensions in units
|
||||
// of micropanels.
|
||||
if ( auto_factor )
|
||||
{
|
||||
// In the block-panel algorithm, the m dimension is parallelized
|
||||
// with ic_nt and the n dimension is parallelized with jc_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
|
||||
|
||||
// Update the ways of parallelism for the jc and ic loops, and then
|
||||
// update the current thread's root thrinfo_t node according to the
|
||||
// new ways of parallelism value for the jc loop.
|
||||
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
|
||||
bli_l3_sup_thrinfo_update_root( rntm, thread );
|
||||
}
|
||||
|
||||
/*Enable packing for B matrix for higher sizes*/
|
||||
if(bli_is_float(dt) && (n_threads==1)) {
|
||||
if((m > 240) && (k > 240) && (n > 240))
|
||||
bli_rntm_set_pack_b( 1, rntm );
|
||||
}
|
||||
|
||||
bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, thread );
|
||||
}
|
||||
else
|
||||
{
|
||||
// This branch handles:
|
||||
// - rrr rrc rcr crr for column-preferential kernels
|
||||
// - rcc crc ccr ccc for row-preferential kernels
|
||||
// - Currently only row-preferential kernels are only supported.
|
||||
const dim_t mu = n / MR; // the n becomes m after a transposition
|
||||
const dim_t nu = m / NR; // the m becomes n after a transposition
|
||||
|
||||
if ( auto_factor )
|
||||
{
|
||||
// In the block-panel algorithm, the m dimension is parallelized
|
||||
// with ic_nt and the n dimension is parallelized with jc_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
|
||||
|
||||
// Update the ways of parallelism for the jc and ic loops, and then
|
||||
// update the current thread's root thrinfo_t node according to the
|
||||
// new ways of parallelism value for the jc loop.
|
||||
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
|
||||
bli_l3_sup_thrinfo_update_root( rntm, thread );
|
||||
}
|
||||
|
||||
/* Enable packing for B matrix for higher sizes. Note that pack A
|
||||
* becomes pack B inside var2m because this is transpose case*/
|
||||
if(bli_is_float(dt) && (n_threads==1)) {
|
||||
if((m > 240) && (k > 240) && (n > 240))
|
||||
bli_rntm_set_pack_a( 1, rntm );
|
||||
}
|
||||
|
||||
bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, thread );
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
|
||||
return BLIS_SUCCESS;
|
||||
|
||||
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
err_t bli_gemmtsup_int
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4);
|
||||
// AOCL_DTL_LOG_GEMMT_INPUTS(AOCL_DTL_LEVEL_TRACE_4, alpha, a, b, beta, c);
|
||||
|
||||
|
||||
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
||||
|
||||
// Don't use the small/unpacked implementation if one of the matrices
|
||||
// uses general stride.
|
||||
if ( stor_id == BLIS_XXX ) {
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_4, "SUP doesn't support general stide.");
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
||||
stor_id == BLIS_RRC ||
|
||||
stor_id == BLIS_RCR ||
|
||||
stor_id == BLIS_CRR );
|
||||
const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
|
||||
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
|
||||
|
||||
const bool is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
|
||||
: is_rcc_crc_ccr_ccc );
|
||||
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = m;
|
||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
|
||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
|
||||
const bool auto_factor = bli_rntm_auto_factor( rntm );
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
bool use_bp = TRUE;
|
||||
dim_t jc_new;
|
||||
dim_t ic_new;
|
||||
|
||||
|
||||
if ( is_primary )
|
||||
{
|
||||
// This branch handles:
|
||||
// - rrr rrc rcr crr for row-preferential kernels
|
||||
// - rcc crc ccr ccc for column-preferential kernels
|
||||
|
||||
const dim_t mu = m / MR;
|
||||
const dim_t nu = n / NR;
|
||||
|
||||
// Decide which algorithm to use (block-panel var2m or panel-block
|
||||
// var1n) based on the number of micropanels in the m and n dimensions.
|
||||
// Also, recalculate the automatic thread factorization.
|
||||
|
||||
if ( mu >= nu ) use_bp = TRUE;
|
||||
else /* if ( mu < nu ) */ use_bp = TRUE;// var1n is not implemented for GEMMT
|
||||
|
||||
// If the parallel thread factorization was automatic, we update it
|
||||
// with a new factorization based on the matrix dimensions in units
|
||||
// of micropanels.
|
||||
if ( auto_factor )
|
||||
{
|
||||
if ( use_bp )
|
||||
{
|
||||
// In the block-panel algorithm, the m dimension is parallelized
|
||||
// with ic_nt and the n dimension is parallelized with jc_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
|
||||
}
|
||||
else // if ( !use_bp )
|
||||
{
|
||||
// In the panel-block algorithm, the m dimension is parallelized
|
||||
// with jc_nt and the n dimension is parallelized with ic_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
|
||||
}
|
||||
|
||||
// Update the ways of parallelism for the jc and ic loops, and then
|
||||
// update the current thread's root thrinfo_t node according to the
|
||||
// new ways of parallelism value for the jc loop.
|
||||
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
|
||||
bli_l3_sup_thrinfo_update_root( rntm, thread );
|
||||
}
|
||||
|
||||
|
||||
if ( use_bp )
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var2m primary\n" );
|
||||
#endif
|
||||
// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
|
||||
bli_gemmtsup_ref_var2m( BLIS_NO_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, thread );
|
||||
}
|
||||
else // use_pb
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var1n primary\n" );
|
||||
#endif
|
||||
// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
|
||||
bli_gemmtsup_ref_var1n( BLIS_NO_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, thread );
|
||||
// *requires nudging of nc up to be a multiple of mr.
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// This branch handles:
|
||||
// - rrr rrc rcr crr for column-preferential kernels
|
||||
// - rcc crc ccr ccc for row-preferential kernels
|
||||
|
||||
const dim_t mu = n / MR; // the n becomes m after a transposition
|
||||
const dim_t nu = m / NR; // the m becomes n after a transposition
|
||||
|
||||
// Decide which algorithm to use (block-panel var2m or panel-block
|
||||
// var1n) based on the number of micropanels in the m and n dimensions.
|
||||
// Also, recalculate the automatic thread factorization.
|
||||
|
||||
if ( mu >= nu ) use_bp = TRUE;
|
||||
else /* if ( mu < nu ) */ use_bp = TRUE; //var1n is not implemented for gemmt
|
||||
|
||||
// If the parallel thread factorization was automatic, we update it
|
||||
// with a new factorization based on the matrix dimensions in units
|
||||
// of micropanels.
|
||||
if ( auto_factor )
|
||||
{
|
||||
if ( use_bp )
|
||||
{
|
||||
// In the block-panel algorithm, the m dimension is parallelized
|
||||
// with ic_nt and the n dimension is parallelized with jc_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
|
||||
}
|
||||
else // if ( !use_bp )
|
||||
{
|
||||
// In the panel-block algorithm, the m dimension is parallelized
|
||||
// with jc_nt and the n dimension is parallelized with ic_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
|
||||
}
|
||||
|
||||
// Update the ways of parallelism for the jc and ic loops, and then
|
||||
// update the current thread's root thrinfo_t node according to the
|
||||
// new ways of parallelism value for the jc loop.
|
||||
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
|
||||
bli_l3_sup_thrinfo_update_root( rntm, thread );
|
||||
}
|
||||
|
||||
|
||||
if ( use_bp )
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var2m non-primary\n" );
|
||||
#endif
|
||||
// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
|
||||
bli_gemmtsup_ref_var2m( BLIS_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, thread );
|
||||
}
|
||||
else // use_pb
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var1n non-primary\n" );
|
||||
#endif
|
||||
// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
|
||||
bli_gemmtsup_ref_var1n( BLIS_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, thread );
|
||||
// *requires nudging of mc up to be a multiple of nr.
|
||||
}
|
||||
}
|
||||
|
||||
// Return success so that the caller knows that we computed the solution.
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -177,19 +177,6 @@ void bli_gemm_front
|
||||
dim_t m_dim_local = bli_obj_length( &c_local );
|
||||
dim_t n_dim_local = bli_obj_width( &c_local );
|
||||
dim_t k_dim_local = bli_obj_width( &a_local );
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
// Regression observed in sgemm native path in cases where m >= 4 * n
|
||||
// after BLIS_THREAD_RATIO_M updated from 2 to 1 as part of commit
|
||||
// 11dfc176a3c422729f453f6c23204cf023e9954d. Temporary workaround for
|
||||
// the issue.
|
||||
if( bli_obj_is_float( &c_local ) &&
|
||||
( n_dim_local >= 1024 ) &&
|
||||
( k_dim_local >= 1024 ) &&
|
||||
( m_dim_local >= ( 4 * n_dim_local ) ) )
|
||||
{
|
||||
m_dim_local *= 2;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
|
||||
413
frame/3/gemm/bli_gemm_front_amd.c
Normal file
413
frame/3/gemm/bli_gemm_front_amd.c
Normal file
@@ -0,0 +1,413 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_gemm_front
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
|
||||
bli_init_once();
|
||||
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
||||
|
||||
// If C has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( c ) )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// If alpha is zero, or if A or B has a zero dimension, scale C by beta
|
||||
// and return early.
|
||||
if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
|
||||
bli_obj_has_zero_dim( a ) ||
|
||||
bli_obj_has_zero_dim( b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX
|
||||
// Only handle small problems separately for homogeneous datatypes.
|
||||
if ( bli_obj_dt( a ) == bli_obj_dt( b ) &&
|
||||
bli_obj_dt( a ) == bli_obj_dt( c ) &&
|
||||
bli_obj_comp_prec( c ) == bli_obj_prec( c ) )
|
||||
{
|
||||
err_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl );
|
||||
|
||||
if ( status == BLIS_SUCCESS )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
return;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Alias A, B, and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( a, &a_local );
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
cntx_t cntx_local;
|
||||
|
||||
// If any of the storage datatypes differ, or if the computation precision
|
||||
// differs from the storage precision of C, utilize the mixed datatype
|
||||
// code path.
|
||||
// NOTE: If we ever want to support the caller setting the computation
|
||||
// domain explicitly, we will need to check the computation dt against the
|
||||
// storage dt of C (instead of the computation precision against the
|
||||
// storage precision of C).
|
||||
if ( bli_obj_dt( &c_local ) != bli_obj_dt( &a_local ) ||
|
||||
bli_obj_dt( &c_local ) != bli_obj_dt( &b_local ) ||
|
||||
bli_obj_comp_prec( &c_local ) != bli_obj_prec( &c_local ) )
|
||||
{
|
||||
// Handle mixed datatype cases in bli_gemm_md(), which may modify
|
||||
// the objects or the context. (If the context is modified, cntx
|
||||
// is adjusted to point to cntx_local.)
|
||||
bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx );
|
||||
}
|
||||
//else // homogeneous datatypes
|
||||
#endif
|
||||
|
||||
// Load the pack schemas from the context and embed them into the objects
|
||||
// for A and B. (Native contexts are initialized with the correct pack
|
||||
// schemas, as are contexts for 1m, and if necessary bli_gemm_md() would
|
||||
// have made a copy and modified the schemas, so reading them from the
|
||||
// context should be a safe bet at this point.) This is a sort of hack for
|
||||
// communicating the desired pack schemas to bli_gemm_cntl_create() (via
|
||||
// bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows us
|
||||
// to subsequently access the schemas from the control tree, which
|
||||
// hopefully reduces some confusion, particularly in bli_packm_init().
|
||||
const pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
const pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
|
||||
// Next, we handle the possibility of needing to typecast alpha to the
|
||||
// computation datatype and/or beta to the storage datatype of C.
|
||||
|
||||
// Attach alpha to B, and in the process typecast alpha to the target
|
||||
// datatype of the matrix (which in this case is equal to the computation
|
||||
// datatype).
|
||||
bli_obj_scalar_attach( BLIS_NO_CONJUGATE, alpha, &b_local );
|
||||
|
||||
// Attach beta to C, and in the process typecast beta to the target
|
||||
// datatype of the matrix (which in this case is equal to the storage
|
||||
// datatype of C).
|
||||
bli_obj_scalar_attach( BLIS_NO_CONJUGATE, beta, &c_local );
|
||||
|
||||
// Change the alpha and beta pointers to BLIS_ONE since the values have
|
||||
// now been typecast and attached to the matrices above.
|
||||
alpha = &BLIS_ONE;
|
||||
beta = &BLIS_ONE;
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
// Don't perform the following optimization for ccr or crc cases, as
|
||||
// those cases are sensitive to the ukernel storage preference (ie:
|
||||
// transposing the operation would break them).
|
||||
if ( !bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
|
||||
!bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) )
|
||||
#endif
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_swap( &a_local, &b_local );
|
||||
|
||||
bli_obj_induce_trans( &a_local );
|
||||
bli_obj_induce_trans( &b_local );
|
||||
bli_obj_induce_trans( &c_local );
|
||||
|
||||
// We must also swap the pack schemas, which were set by bli_gemm_md()
|
||||
// or the inlined code above.
|
||||
bli_obj_swap_pack_schemas( &a_local, &b_local );
|
||||
}
|
||||
|
||||
dim_t m_dim_local = bli_obj_length( &c_local );
|
||||
dim_t n_dim_local = bli_obj_width( &c_local );
|
||||
dim_t k_dim_local = bli_obj_width( &a_local );
|
||||
|
||||
// Regression observed in sgemm native path in cases where m >= 4 * n
|
||||
// after BLIS_THREAD_RATIO_M updated from 2 to 1 as part of commit
|
||||
// 11dfc176a3c422729f453f6c23204cf023e9954d. Temporary workaround for
|
||||
// the issue.
|
||||
if( bli_obj_is_float( &c_local ) &&
|
||||
( n_dim_local >= 1024 ) &&
|
||||
( k_dim_local >= 1024 ) &&
|
||||
( m_dim_local >= ( 4 * n_dim_local ) ) )
|
||||
{
|
||||
m_dim_local *= 2;
|
||||
}
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
// additional modifications necessary for the current operation.
|
||||
bli_rntm_set_ways_for_op
|
||||
(
|
||||
BLIS_GEMM,
|
||||
BLIS_LEFT, // ignored for gemm/hemm/symm
|
||||
m_dim_local,
|
||||
n_dim_local,
|
||||
k_dim_local,
|
||||
rntm
|
||||
);
|
||||
|
||||
obj_t* cp = &c_local;
|
||||
obj_t* betap = beta;
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
|
||||
// If any of the following conditions are met, create a temporary matrix
|
||||
// conformal to C into which we will accumulate the matrix product:
|
||||
// - the storage precision of C differs from the computation precision;
|
||||
// - the domains are mixed as crr;
|
||||
// - the storage format of C does not match the preferred orientation
|
||||
// of the ccr or crc cases.
|
||||
// Then, after the computation is complete, this matrix will be copied
|
||||
// or accumulated back to C.
|
||||
const bool is_ccr_mismatch =
|
||||
( bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
|
||||
!bli_obj_is_col_stored( &c_local ) );
|
||||
const bool is_crc_mismatch =
|
||||
( bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) &&
|
||||
!bli_obj_is_row_stored( &c_local ) );
|
||||
|
||||
obj_t ct;
|
||||
bool use_ct = FALSE;
|
||||
|
||||
// FGVZ: Consider adding another guard here that only creates and uses a
|
||||
// temporary matrix for accumulation if k < c * kc, where c is some small
|
||||
// constant like 2. And don't forget to use the same conditional for the
|
||||
// castm() and free() at the end.
|
||||
if (
|
||||
bli_obj_prec( &c_local ) != bli_obj_comp_prec( &c_local ) ||
|
||||
bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) ||
|
||||
is_ccr_mismatch ||
|
||||
is_crc_mismatch
|
||||
)
|
||||
{
|
||||
use_ct = TRUE;
|
||||
}
|
||||
|
||||
// If we need a temporary matrix conformal to C for whatever reason,
|
||||
// we create it and prepare to use it now.
|
||||
if ( use_ct )
|
||||
{
|
||||
const dim_t m = bli_obj_length( &c_local );
|
||||
const dim_t n = bli_obj_width( &c_local );
|
||||
inc_t rs = bli_obj_row_stride( &c_local );
|
||||
inc_t cs = bli_obj_col_stride( &c_local );
|
||||
|
||||
num_t dt_ct = bli_obj_domain( &c_local ) |
|
||||
bli_obj_comp_prec( &c_local );
|
||||
|
||||
// When performing the crr case, accumulate to a contiguously-stored
|
||||
// real matrix so we do not have to repeatedly update C with general
|
||||
// stride.
|
||||
if ( bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) )
|
||||
dt_ct = BLIS_REAL | bli_obj_comp_prec( &c_local );
|
||||
|
||||
// When performing the mismatched ccr or crc cases, now is the time
|
||||
// to specify the appropriate storage so the gemm_md_c2r_ref() virtual
|
||||
// microkernel can output directly to C (instead of using a temporary
|
||||
// microtile).
|
||||
if ( is_ccr_mismatch ) { rs = 1; cs = m; }
|
||||
else if ( is_crc_mismatch ) { rs = n; cs = 1; }
|
||||
|
||||
bli_obj_create( dt_ct, m, n, rs, cs, &ct );
|
||||
|
||||
const num_t dt_exec = bli_obj_exec_dt( &c_local );
|
||||
const num_t dt_comp = bli_obj_comp_dt( &c_local );
|
||||
|
||||
bli_obj_set_target_dt( dt_ct, &ct );
|
||||
bli_obj_set_exec_dt( dt_exec, &ct );
|
||||
bli_obj_set_comp_dt( dt_comp, &ct );
|
||||
|
||||
// A naive approach would cast C to the comptuation datatype,
|
||||
// compute with beta, and then cast the result back to the
|
||||
// user-provided output matrix. However, we employ a different
|
||||
// approach that halves the number of memops on C (or its
|
||||
// typecast temporary) by writing the A*B product directly to
|
||||
// temporary storage, and then using xpbym to scale the
|
||||
// output matrix by beta and accumulate/cast the A*B product.
|
||||
//bli_castm( &c_local, &ct );
|
||||
betap = &BLIS_ZERO;
|
||||
|
||||
cp = &ct;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Invoke the internal back-end via the thread handler.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_GEMM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
betap,
|
||||
cp,
|
||||
cntx,
|
||||
rntm,
|
||||
cntl
|
||||
);
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
|
||||
// If we created a temporary matrix conformal to C for whatever reason,
|
||||
// we copy/accumulate the result back to C and then release the object.
|
||||
if ( use_ct )
|
||||
{
|
||||
obj_t beta_local;
|
||||
|
||||
bli_obj_scalar_detach( &c_local, &beta_local );
|
||||
|
||||
//bli_castnzm( &ct, &c_local );
|
||||
bli_xpbym( &ct, &beta_local, &c_local );
|
||||
|
||||
bli_obj_free( &ct );
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
#if 0
|
||||
if ( bli_obj_dt( a ) != bli_obj_dt( b ) ||
|
||||
bli_obj_dt( a ) != bli_obj_dt( c ) ||
|
||||
bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
|
||||
{
|
||||
const bool a_is_real = bli_obj_is_real( a );
|
||||
const bool a_is_comp = bli_obj_is_complex( a );
|
||||
const bool b_is_real = bli_obj_is_real( b );
|
||||
const bool b_is_comp = bli_obj_is_complex( b );
|
||||
const bool c_is_real = bli_obj_is_real( c );
|
||||
const bool c_is_comp = bli_obj_is_complex( c );
|
||||
|
||||
const bool a_is_single = bli_obj_is_single_prec( a );
|
||||
const bool a_is_double = bli_obj_is_double_prec( a );
|
||||
const bool b_is_single = bli_obj_is_single_prec( b );
|
||||
const bool b_is_double = bli_obj_is_double_prec( b );
|
||||
const bool c_is_single = bli_obj_is_single_prec( c );
|
||||
const bool c_is_double = bli_obj_is_double_prec( c );
|
||||
|
||||
const bool comp_single = bli_obj_comp_prec( c ) == BLIS_SINGLE_PREC;
|
||||
const bool comp_double = bli_obj_comp_prec( c ) == BLIS_DOUBLE_PREC;
|
||||
|
||||
const bool mixeddomain = bli_obj_domain( c ) != bli_obj_domain( a ) ||
|
||||
bli_obj_domain( c ) != bli_obj_domain( b );
|
||||
|
||||
( void )a_is_real; ( void )a_is_comp;
|
||||
( void )b_is_real; ( void )b_is_comp;
|
||||
( void )c_is_real; ( void )c_is_comp;
|
||||
( void )a_is_single; ( void )a_is_double;
|
||||
( void )b_is_single; ( void )b_is_double;
|
||||
( void )c_is_single; ( void )c_is_double;
|
||||
( void )comp_single; ( void )comp_double;
|
||||
|
||||
if (
|
||||
//( c_is_comp && a_is_comp && b_is_real ) ||
|
||||
//( c_is_comp && a_is_real && b_is_comp ) ||
|
||||
//( c_is_real && a_is_comp && b_is_comp ) ||
|
||||
//( c_is_comp && a_is_real && b_is_real ) ||
|
||||
//( c_is_real && a_is_comp && b_is_real ) ||
|
||||
//( c_is_real && a_is_real && b_is_comp ) ||
|
||||
//FALSE
|
||||
TRUE
|
||||
)
|
||||
{
|
||||
if (
|
||||
( c_is_single && a_is_single && b_is_single && mixeddomain ) ||
|
||||
( c_is_single && a_is_single && b_is_single && comp_single ) ||
|
||||
( c_is_single && a_is_single && b_is_single && comp_double ) ||
|
||||
( c_is_single && a_is_single && b_is_double ) ||
|
||||
( c_is_single && a_is_double && b_is_single ) ||
|
||||
( c_is_double && a_is_single && b_is_single ) ||
|
||||
( c_is_single && a_is_double && b_is_double ) ||
|
||||
( c_is_double && a_is_single && b_is_double ) ||
|
||||
( c_is_double && a_is_double && b_is_single ) ||
|
||||
( c_is_double && a_is_double && b_is_double && comp_single ) ||
|
||||
( c_is_double && a_is_double && b_is_double && comp_double ) ||
|
||||
( c_is_double && a_is_double && b_is_double && mixeddomain ) ||
|
||||
FALSE
|
||||
)
|
||||
bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl );
|
||||
else
|
||||
bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl );
|
||||
}
|
||||
else
|
||||
bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl );
|
||||
return;
|
||||
}
|
||||
#else
|
||||
#if 0
|
||||
// If any of the storage datatypes differ, or if the execution precision
|
||||
// differs from the storage precision of C, utilize the mixed datatype
|
||||
// code path.
|
||||
// NOTE: We could check the exec dt against the storage dt of C, but for
|
||||
// now we don't support the caller setting the execution domain
|
||||
// explicitly.
|
||||
if ( bli_obj_dt( a ) != bli_obj_dt( b ) ||
|
||||
bli_obj_dt( a ) != bli_obj_dt( c ) ||
|
||||
bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
|
||||
{
|
||||
bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl );
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -501,6 +501,25 @@ bool bli_cpuid_is_bulldozer
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool bli_cpuid_is_avx_supported( void )
|
||||
{
|
||||
uint32_t family, model, features;
|
||||
|
||||
// Call the CPUID instruction and parse its results into a family id,
|
||||
// model id, and a feature bit field. The return value encodes the
|
||||
// vendor.
|
||||
bli_cpuid_query( &family, &model, &features );
|
||||
|
||||
// Check for expected CPU features.
|
||||
const uint32_t expected = FEATURE_AVX |
|
||||
FEATURE_FMA3 |
|
||||
FEATURE_AVX2;
|
||||
|
||||
if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)
|
||||
|
||||
arch_t bli_cpuid_query_id( void )
|
||||
|
||||
@@ -133,7 +133,7 @@ BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want )
|
||||
|
||||
void get_cpu_name( char *cpu_name );
|
||||
int vpu_count( void );
|
||||
|
||||
bool bli_cpuid_is_avx_supported(void);
|
||||
|
||||
enum
|
||||
{
|
||||
@@ -160,6 +160,8 @@ enum
|
||||
FEATURE_AVX512VL = 0x4000
|
||||
};
|
||||
|
||||
|
||||
|
||||
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)
|
||||
|
||||
char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath );
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018-2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -98,217 +98,5 @@ f77_int PASTEF772(i,chx,blasname) \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
|
||||
f77_int isamax_
|
||||
(
|
||||
const f77_int* n,
|
||||
const float* x, const f77_int* incx
|
||||
)
|
||||
{
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx);
|
||||
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
inc_t incx0;
|
||||
gint_t bli_index;
|
||||
f77_int f77_index;
|
||||
|
||||
/* If the vector is empty, return an index of zero. This early check
|
||||
is needed to emulate netlib BLAS. Without it, bli_?amaxv() will
|
||||
return 0, which ends up getting incremented to 1 (below) before
|
||||
being returned, which is not what we want. */
|
||||
if ( *n < 1 || *incx <= 0 ) {
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "isamax_: vector empty");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((float*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((float*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen)
|
||||
{
|
||||
/* Call BLIS kernel */
|
||||
bli_samaxv_zen_int
|
||||
(
|
||||
n0,
|
||||
x0, incx0,
|
||||
&bli_index,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC2(s,amaxv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
n0,
|
||||
x0, incx0,
|
||||
&bli_index,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
/* Convert zero-based BLIS (C) index to one-based BLAS (Fortran)
|
||||
index. Also, if the BLAS integer size differs from the BLIS
|
||||
integer size, that typecast occurs here. */
|
||||
f77_index = bli_index + 1;
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
|
||||
return f77_index;
|
||||
}
|
||||
|
||||
f77_int idamax_
|
||||
(
|
||||
const f77_int* n,
|
||||
const double* x, const f77_int* incx
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx);
|
||||
|
||||
dim_t n0;
|
||||
double* x0;
|
||||
inc_t incx0;
|
||||
gint_t bli_index;
|
||||
f77_int f77_index;
|
||||
|
||||
/* If the vector is empty, return an index of zero. This early check
|
||||
is needed to emulate netlib BLAS. Without it, bli_?amaxv() will
|
||||
return 0, which ends up getting incremented to 1 (below) before
|
||||
being returned, which is not what we want. */
|
||||
if ( *n < 1 || *incx <= 0 ) {
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "idamax_: vector empty");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((double*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((double*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen)
|
||||
{
|
||||
/* Call BLIS kernel */
|
||||
bli_damaxv_zen_int
|
||||
(
|
||||
n0,
|
||||
x0, incx0,
|
||||
&bli_index,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC2(d,amaxv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
n0,
|
||||
x0, incx0,
|
||||
&bli_index,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
/* Convert zero-based BLIS (C) index to one-based BLAS (Fortran)
|
||||
index. Also, if the BLAS integer size differs from the BLIS
|
||||
integer size, that typecast occurs here. */
|
||||
f77_index = bli_index + 1;
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return f77_index;
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BLAS_CZ( amax, amaxv )
|
||||
#else
|
||||
INSERT_GENTFUNC_BLAS( amax, amaxv )
|
||||
#endif
|
||||
#endif
|
||||
|
||||
295
frame/compat/bla_amax_amd.c
Normal file
295
frame/compat/bla_amax_amd.c
Normal file
@@ -0,0 +1,295 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
//
|
||||
// Define BLAS-to-BLIS interfaces.
|
||||
//
|
||||
#undef GENTFUNC
// Template for the generic i?amax BLAS wrappers: converts BLAS-style
// arguments to BLIS form, calls the expert-API bli_?amaxv(), and maps
// the zero-based result to the one-based index BLAS callers expect.
#define GENTFUNC( ftype_x, chx, blasname, blisname ) \
\
f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     ) \
{ \
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
	AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(chx), *n, *incx) \
\
	dim_t    n_conv; \
	ftype_x* x_conv; \
	inc_t    incx_conv; \
	gint_t   blis_index; \
	f77_int  one_based_index; \
\
	/* Return an index of zero for an empty vector, mirroring netlib
	   BLAS. Without this guard, bli_?amaxv() would yield 0, which the
	   +1 adjustment below would turn into 1 — the wrong answer. */ \
	if ( *n < 1 || *incx <= 0 ) { \
		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "iamax_: vector empty") \
		return 0; \
	}\
\
	/* Initialize BLIS. */ \
	bli_init_auto(); \
\
	/* Clamp negative n to zero and typecast to a BLIS dimension. */ \
	bli_convert_blas_dim1( *n, n_conv ); \
\
	/* Rebase the pointer and stride so a negative BLAS increment
	   becomes a BLIS-style negative stride relative to the last
	   element. */ \
	bli_convert_blas_incv( n_conv, (ftype_x*)x, *incx, x_conv, incx_conv ); \
\
	/* Invoke the BLIS expert interface. */ \
	PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
	( \
	  n_conv, \
	  x_conv, incx_conv, \
	  &blis_index, \
	  NULL, \
	  NULL \
	); \
\
	/* Convert the zero-based BLIS (C) index to the one-based BLAS
	   (Fortran) index; any BLAS/BLIS integer-width cast happens here. */ \
	one_based_index = blis_index + 1; \
\
	/* Finalize BLIS. */ \
	bli_finalize_auto(); \
\
	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
	return one_based_index; \
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
|
||||
f77_int isamax_
|
||||
(
|
||||
const f77_int* n,
|
||||
const float* x, const f77_int* incx
|
||||
)
|
||||
{
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx);
|
||||
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
inc_t incx0;
|
||||
gint_t bli_index;
|
||||
f77_int f77_index;
|
||||
|
||||
/* If the vector is empty, return an index of zero. This early check
|
||||
is needed to emulate netlib BLAS. Without it, bli_?amaxv() will
|
||||
return 0, which ends up getting incremented to 1 (below) before
|
||||
being returned, which is not what we want. */
|
||||
if ( *n < 1 || *incx <= 0 ) {
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "isamax_: vector empty");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((float*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((float*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE)
|
||||
{
|
||||
/* Call BLIS kernel */
|
||||
bli_samaxv_zen_int
|
||||
(
|
||||
n0,
|
||||
x0, incx0,
|
||||
&bli_index,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC2(s,amaxv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
n0,
|
||||
x0, incx0,
|
||||
&bli_index,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
/* Convert zero-based BLIS (C) index to one-based BLAS (Fortran)
|
||||
index. Also, if the BLAS integer size differs from the BLIS
|
||||
integer size, that typecast occurs here. */
|
||||
f77_index = bli_index + 1;
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
|
||||
return f77_index;
|
||||
}
|
||||
|
||||
f77_int idamax_
|
||||
(
|
||||
const f77_int* n,
|
||||
const double* x, const f77_int* incx
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx);
|
||||
|
||||
dim_t n0;
|
||||
double* x0;
|
||||
inc_t incx0;
|
||||
gint_t bli_index;
|
||||
f77_int f77_index;
|
||||
|
||||
/* If the vector is empty, return an index of zero. This early check
|
||||
is needed to emulate netlib BLAS. Without it, bli_?amaxv() will
|
||||
return 0, which ends up getting incremented to 1 (below) before
|
||||
being returned, which is not what we want. */
|
||||
if ( *n < 1 || *incx <= 0 ) {
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "idamax_: vector empty");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((double*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((double*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE)
|
||||
{
|
||||
/* Call BLIS kernel */
|
||||
bli_damaxv_zen_int
|
||||
(
|
||||
n0,
|
||||
x0, incx0,
|
||||
&bli_index,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC2(d,amaxv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
n0,
|
||||
x0, incx0,
|
||||
&bli_index,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
/* Convert zero-based BLIS (C) index to one-based BLAS (Fortran)
|
||||
index. Also, if the BLAS integer size differs from the BLIS
|
||||
integer size, that typecast occurs here. */
|
||||
f77_index = bli_index + 1;
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return f77_index;
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BLAS_CZ( amax, amaxv )
|
||||
|
||||
#endif
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020 - 21, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -87,411 +87,6 @@ void PASTEF77(ch,blasname) \
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
void saxpy_
|
||||
(
|
||||
const f77_int* n,
|
||||
const float* alpha,
|
||||
const float* x, const f77_int* incx,
|
||||
float* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, (float*)alpha, *incx, *incy)
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
float* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
x0 = ((float*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((float*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((float*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((float*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen)
|
||||
{
|
||||
bli_saxpyv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(float*)alpha,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC2(s,axpyv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(float*)alpha,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
|
||||
}
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
}
|
||||
|
||||
void daxpy_
|
||||
(
|
||||
const f77_int* n,
|
||||
const double* alpha,
|
||||
const double* x, const f77_int* incx,
|
||||
double* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
double* x0;
|
||||
double* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, (double*)alpha, *incx, *incy)
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
x0 = ((double*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((double*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((double*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((double*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen)
|
||||
{
|
||||
bli_daxpyv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(double*)alpha,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC2(d,axpyv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(double*)alpha,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
}
|
||||
|
||||
void caxpy_
|
||||
(
|
||||
const f77_int* n,
|
||||
const scomplex* alpha,
|
||||
const scomplex* x, const f77_int* incx,
|
||||
scomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
scomplex* x0;
|
||||
scomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, (scomplex*)alpha, *incx, *incy)
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((scomplex*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((scomplex*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen)
|
||||
{
|
||||
bli_caxpyv_zen_int5
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(scomplex*)alpha,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC2(c,axpyv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(scomplex*)alpha,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
}
|
||||
|
||||
void zaxpy_
|
||||
(
|
||||
const f77_int* n,
|
||||
const dcomplex* alpha,
|
||||
const dcomplex* x, const f77_int* incx,
|
||||
dcomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
dcomplex* x0;
|
||||
dcomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, (dcomplex*)alpha, *incx, *incy)
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((dcomplex*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((dcomplex*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen)
|
||||
{
|
||||
bli_zaxpyv_zen_int5
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(dcomplex*)alpha,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC2(z,axpyv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(dcomplex*)alpha,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
}
|
||||
|
||||
#else
|
||||
INSERT_GENTFUNC_BLAS( axpy, axpyv )
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
462
frame/compat/bla_axpy_amd.c
Normal file
462
frame/compat/bla_axpy_amd.c
Normal file
@@ -0,0 +1,462 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-to-BLIS interfaces.
|
||||
//
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ftype, ch, blasname, blisname ) \
|
||||
\
|
||||
void PASTEF77(ch,blasname) \
|
||||
( \
|
||||
const f77_int* n, \
|
||||
const ftype* alpha, \
|
||||
const ftype* x, const f77_int* incx, \
|
||||
ftype* y, const f77_int* incy \
|
||||
) \
|
||||
{ \
|
||||
dim_t n0; \
|
||||
ftype* x0; \
|
||||
ftype* y0; \
|
||||
inc_t incx0; \
|
||||
inc_t incy0; \
|
||||
\
|
||||
/* Initialize BLIS. */ \
|
||||
bli_init_auto(); \
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
|
||||
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, (void*)alpha, *incx, *incy) \
|
||||
/* Convert/typecast negative values of n to zero. */ \
|
||||
bli_convert_blas_dim1( *n, n0 ); \
|
||||
\
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */ \
|
||||
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
|
||||
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
|
||||
\
|
||||
/* Call BLIS interface. */ \
|
||||
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n0, \
|
||||
(ftype*)alpha, \
|
||||
x0, incx0, \
|
||||
y0, incy0, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
\
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
|
||||
void saxpy_
|
||||
(
|
||||
const f77_int* n,
|
||||
const float* alpha,
|
||||
const float* x, const f77_int* incx,
|
||||
float* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, (float*)alpha, *incx, *incy)
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
float* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
x0 = ((float*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((float*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((float*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((float*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE)
|
||||
{
|
||||
bli_saxpyv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(float*)alpha,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC2(s,axpyv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(float*)alpha,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
|
||||
}
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
}
|
||||
|
||||
void daxpy_
|
||||
(
|
||||
const f77_int* n,
|
||||
const double* alpha,
|
||||
const double* x, const f77_int* incx,
|
||||
double* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
double* x0;
|
||||
double* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, (double*)alpha, *incx, *incy)
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
x0 = ((double*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((double*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((double*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((double*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE)
|
||||
{
|
||||
bli_daxpyv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(double*)alpha,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC2(d,axpyv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(double*)alpha,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
}
|
||||
|
||||
void caxpy_
|
||||
(
|
||||
const f77_int* n,
|
||||
const scomplex* alpha,
|
||||
const scomplex* x, const f77_int* incx,
|
||||
scomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
scomplex* x0;
|
||||
scomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, (scomplex*)alpha, *incx, *incy)
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((scomplex*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((scomplex*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE)
|
||||
{
|
||||
bli_caxpyv_zen_int5
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(scomplex*)alpha,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC2(c,axpyv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(scomplex*)alpha,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
}
|
||||
|
||||
void zaxpy_
|
||||
(
|
||||
const f77_int* n,
|
||||
const dcomplex* alpha,
|
||||
const dcomplex* x, const f77_int* incx,
|
||||
dcomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
dcomplex* x0;
|
||||
dcomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, (dcomplex*)alpha, *incx, *incy)
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((dcomplex*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((dcomplex*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE)
|
||||
{
|
||||
bli_zaxpyv_zen_int5
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(dcomplex*)alpha,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC2(z,axpyv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(dcomplex*)alpha,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
}
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -88,217 +88,5 @@ void PASTEF77(ch,blasname) \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
|
||||
void scopy_
|
||||
(
|
||||
const f77_int* n,
|
||||
const float* x, const f77_int* incx,
|
||||
float* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
float* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy)
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if (*n < 0)
|
||||
n0 = (dim_t)0;
|
||||
else
|
||||
n0 = (dim_t)(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if (*incx < 0)
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = (float*)((x)+(n0 - 1)*(-*incx));
|
||||
incx0 = (inc_t)(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (float*)(x);
|
||||
incx0 = (inc_t)(*incx);
|
||||
}
|
||||
|
||||
if (*incy < 0)
|
||||
{
|
||||
y0 = (y)+(n0 - 1)*(-*incy);
|
||||
incy0 = (inc_t)(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = (y);
|
||||
incy0 = (inc_t)(*incy);
|
||||
}
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen)
|
||||
{
|
||||
/* Call BLIS kernel */
|
||||
bli_scopyv_zen_int
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC2(s, copyv, BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
}
|
||||
|
||||
void dcopy_
|
||||
(
|
||||
const f77_int* n,
|
||||
const double* x, const f77_int* incx,
|
||||
double* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
double* x0;
|
||||
double* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy)
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if (*n < 0)
|
||||
n0 = (dim_t)0;
|
||||
else
|
||||
n0 = (dim_t)(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if (*incx < 0)
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = (double*)((x)+(n0 - 1)*(-*incx));
|
||||
incx0 = (inc_t)(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (double*)(x);
|
||||
incx0 = (inc_t)(*incx);
|
||||
}
|
||||
|
||||
if (*incy < 0)
|
||||
{
|
||||
y0 = (y)+(n0 - 1)*(-*incy);
|
||||
incy0 = (inc_t)(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = (y);
|
||||
incy0 = (inc_t)(*incy);
|
||||
}
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen)
|
||||
{
|
||||
/* Call BLIS kernel */
|
||||
bli_dcopyv_zen_int
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC2(d, copyv, BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BLAS_CZ(copy, copyv)
|
||||
#else
|
||||
INSERT_GENTFUNC_BLAS(copy, copyv)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
285
frame/compat/bla_copy_amd.c
Normal file
285
frame/compat/bla_copy_amd.c
Normal file
@@ -0,0 +1,285 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-to-BLIS interfaces.
|
||||
//
|
||||
#undef  GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   x, const f77_int* incx, \
             ftype*   y, const f77_int* incy \
     ) \
{ \
	dim_t  n0; \
	ftype* x0; \
	ftype* y0; \
	inc_t  incx0; \
	inc_t  incy0; \
\
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
	AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, *incx, *incy) \
\
	/* Initialize BLIS. */ \
	bli_init_auto(); \
\
	/* Per BLAS convention, a negative length is treated as zero. */ \
	bli_convert_blas_dim1( *n, n0 ); \
\
	/* BLAS passes the first element even for negative strides; rebase
	   the pointers so BLIS can traverse with the strides as given. */ \
	bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
	bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
	/* Call the BLIS expert interface. */ \
	PASTEMAC2(ch, blisname, BLIS_TAPI_EX_SUF) \
	( \
	  BLIS_NO_CONJUGATE, \
	  n0, \
	  x0, incx0, \
	  y0, incy0, \
	  NULL, \
	  NULL  \
	); \
\
	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
\
	/* Finalize BLIS. */ \
	bli_finalize_auto(); \
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
|
||||
void scopy_
|
||||
(
|
||||
const f77_int* n,
|
||||
const float* x, const f77_int* incx,
|
||||
float* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
float* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy)
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if (*n < 0)
|
||||
n0 = (dim_t)0;
|
||||
else
|
||||
n0 = (dim_t)(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if (*incx < 0)
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = (float*)((x)+(n0 - 1)*(-*incx));
|
||||
incx0 = (inc_t)(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (float*)(x);
|
||||
incx0 = (inc_t)(*incx);
|
||||
}
|
||||
|
||||
if (*incy < 0)
|
||||
{
|
||||
y0 = (y)+(n0 - 1)*(-*incy);
|
||||
incy0 = (inc_t)(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = (y);
|
||||
incy0 = (inc_t)(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE)
|
||||
{
|
||||
/* Call BLIS kernel */
|
||||
bli_scopyv_zen_int
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC2(s, copyv, BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
}
|
||||
|
||||
void dcopy_
|
||||
(
|
||||
const f77_int* n,
|
||||
const double* x, const f77_int* incx,
|
||||
double* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
double* x0;
|
||||
double* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy)
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if (*n < 0)
|
||||
n0 = (dim_t)0;
|
||||
else
|
||||
n0 = (dim_t)(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if (*incx < 0)
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = (double*)((x)+(n0 - 1)*(-*incx));
|
||||
incx0 = (inc_t)(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (double*)(x);
|
||||
incx0 = (inc_t)(*incx);
|
||||
}
|
||||
|
||||
if (*incy < 0)
|
||||
{
|
||||
y0 = (y)+(n0 - 1)*(-*incy);
|
||||
incy0 = (inc_t)(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = (y);
|
||||
incy0 = (inc_t)(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE)
|
||||
{
|
||||
/* Call BLIS kernel */
|
||||
bli_dcopyv_zen_int
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC2(d, copyv, BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BLAS_CZ(copy, copyv)
|
||||
|
||||
#endif
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018-2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -90,681 +90,11 @@ ftype PASTEF772(ch,blasname,chc) \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
float sdot_
|
||||
(
|
||||
const f77_int* n,
|
||||
const float* x, const f77_int* incx,
|
||||
const float* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy);
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
float* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
float rho;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((float*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((float*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((float*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((float*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen)
|
||||
{
|
||||
/* Call BLIS kernel. */
|
||||
bli_sdotv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Call BLIS interface. */
|
||||
PASTEMAC2(s,dotv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return rho;
|
||||
}
|
||||
|
||||
double ddot_
|
||||
(
|
||||
const f77_int* n,
|
||||
const double* x, const f77_int* incx,
|
||||
const double* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy);
|
||||
dim_t n0;
|
||||
double* x0;
|
||||
double* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
double rho;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((double*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((double*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((double*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((double*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen)
|
||||
{
|
||||
/* Call BLIS kernel. */
|
||||
bli_ddotv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Call BLIS interface. */
|
||||
PASTEMAC2(d,dotv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return rho;
|
||||
}
|
||||
#else
|
||||
INSERT_GENTFUNCDOTR_BLAS( dot, dotv )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
#ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
scomplex cdotu_
|
||||
(
|
||||
const f77_int* n,
|
||||
const scomplex* x, const f77_int* incx,
|
||||
const scomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, *incx, *incy);
|
||||
dim_t n0;
|
||||
scomplex* x0;
|
||||
scomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
scomplex rho;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((scomplex*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((scomplex*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen)
|
||||
{
|
||||
/* Call BLIS kernel. */
|
||||
bli_cdotv_zen_int5
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Call BLIS interface. */
|
||||
PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return rho;
|
||||
}
|
||||
|
||||
dcomplex zdotu_
|
||||
(
|
||||
const f77_int* n,
|
||||
const dcomplex* x, const f77_int* incx,
|
||||
const dcomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
dcomplex* x0;
|
||||
dcomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
dcomplex rho;
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, *incx, *incy);
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((dcomplex*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((dcomplex*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen)
|
||||
{
|
||||
/* Call BLIS kernel. */
|
||||
bli_zdotv_zen_int5
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Call BLIS interface. */
|
||||
PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
|
||||
return rho;
|
||||
}
|
||||
|
||||
|
||||
scomplex cdotc_
|
||||
(
|
||||
const f77_int* n,
|
||||
const scomplex* x, const f77_int* incx,
|
||||
const scomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
scomplex* x0;
|
||||
scomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
scomplex rho;
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, *incx, *incy);
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((scomplex*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((scomplex*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen)
|
||||
{
|
||||
/* Call BLIS kernel. */
|
||||
bli_cdotv_zen_int5
|
||||
(
|
||||
BLIS_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Call BLIS interface. */
|
||||
PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
|
||||
return rho;
|
||||
}
|
||||
|
||||
dcomplex zdotc_
|
||||
(
|
||||
const f77_int* n,
|
||||
const dcomplex* x, const f77_int* incx,
|
||||
const dcomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, *incx, *incy);
|
||||
dim_t n0;
|
||||
dcomplex* x0;
|
||||
dcomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
dcomplex rho;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((dcomplex*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((dcomplex*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen)
|
||||
{
|
||||
/* Call BLIS kernel. */
|
||||
bli_zdotv_zen_int5
|
||||
(
|
||||
BLIS_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Call BLIS interface. */
|
||||
PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
|
||||
return rho;
|
||||
}
|
||||
#else
|
||||
INSERT_GENTFUNCDOTC_BLAS( dot, dotv )
|
||||
#endif
|
||||
#else
|
||||
// For the "intel" complex return type, use a hidden parameter to return the result
|
||||
#undef GENTFUNCDOT
|
||||
@@ -819,8 +149,8 @@ void PASTEF772(ch,blasname,chc) \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCDOTC_BLAS( dot, dotv )
|
||||
#endif
|
||||
#endif
|
||||
#endif // BLIS_DISABLE_COMPLEX_RETURN_INTEL
|
||||
#endif // BLIS_ENABLE_BLAS
|
||||
|
||||
|
||||
// -- "Black sheep" dot product function definitions --
|
||||
@@ -894,4 +224,4 @@ double PASTEF77(d,sdot)
|
||||
return rho;
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // BLIS_ENABLE_BLAS
|
||||
|
||||
841
frame/compat/bla_dot_amd.c
Normal file
841
frame/compat/bla_dot_amd.c
Normal file
@@ -0,0 +1,841 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-to-BLIS interfaces.
|
||||
//
|
||||
#undef GENTFUNCDOT
|
||||
#define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
|
||||
\
|
||||
ftype PASTEF772(ch,blasname,chc) \
|
||||
( \
|
||||
const f77_int* n, \
|
||||
const ftype* x, const f77_int* incx, \
|
||||
const ftype* y, const f77_int* incy \
|
||||
) \
|
||||
{ \
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
|
||||
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, *incx, *incy); \
|
||||
dim_t n0; \
|
||||
ftype* x0; \
|
||||
ftype* y0; \
|
||||
inc_t incx0; \
|
||||
inc_t incy0; \
|
||||
ftype rho; \
|
||||
\
|
||||
/* Initialize BLIS. */ \
|
||||
bli_init_auto(); \
|
||||
\
|
||||
/* Convert/typecast negative values of n to zero. */ \
|
||||
bli_convert_blas_dim1( *n, n0 ); \
|
||||
\
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */ \
|
||||
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
|
||||
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
|
||||
\
|
||||
/* Call BLIS interface. */ \
|
||||
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
blis_conjx, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n0, \
|
||||
x0, incx0, \
|
||||
y0, incy0, \
|
||||
&rho, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
\
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
\
|
||||
return rho; \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
float sdot_
|
||||
(
|
||||
const f77_int* n,
|
||||
const float* x, const f77_int* incx,
|
||||
const float* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy);
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
float* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
float rho;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((float*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((float*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((float*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((float*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE)
|
||||
{
|
||||
/* Call BLIS kernel. */
|
||||
bli_sdotv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Call BLIS interface. */
|
||||
PASTEMAC2(s,dotv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return rho;
|
||||
}
|
||||
|
||||
double ddot_
|
||||
(
|
||||
const f77_int* n,
|
||||
const double* x, const f77_int* incx,
|
||||
const double* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy);
|
||||
dim_t n0;
|
||||
double* x0;
|
||||
double* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
double rho;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((double*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((double*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((double*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((double*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE)
|
||||
{
|
||||
/* Call BLIS kernel. */
|
||||
bli_ddotv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Call BLIS interface. */
|
||||
PASTEMAC2(d,dotv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return rho;
|
||||
}
|
||||
|
||||
#ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL
|
||||
scomplex cdotu_
|
||||
(
|
||||
const f77_int* n,
|
||||
const scomplex* x, const f77_int* incx,
|
||||
const scomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, *incx, *incy);
|
||||
dim_t n0;
|
||||
scomplex* x0;
|
||||
scomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
scomplex rho;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((scomplex*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((scomplex*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE)
|
||||
{
|
||||
/* Call BLIS kernel. */
|
||||
bli_cdotv_zen_int5
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Call BLIS interface. */
|
||||
PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return rho;
|
||||
}
|
||||
|
||||
dcomplex zdotu_
|
||||
(
|
||||
const f77_int* n,
|
||||
const dcomplex* x, const f77_int* incx,
|
||||
const dcomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
dcomplex* x0;
|
||||
dcomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
dcomplex rho;
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, *incx, *incy);
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((dcomplex*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((dcomplex*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE)
|
||||
{
|
||||
/* Call BLIS kernel. */
|
||||
bli_zdotv_zen_int5
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Call BLIS interface. */
|
||||
PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
|
||||
return rho;
|
||||
}
|
||||
|
||||
|
||||
scomplex cdotc_
|
||||
(
|
||||
const f77_int* n,
|
||||
const scomplex* x, const f77_int* incx,
|
||||
const scomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
scomplex* x0;
|
||||
scomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
scomplex rho;
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, *incx, *incy);
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((scomplex*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((scomplex*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE)
|
||||
{
|
||||
/* Call BLIS kernel. */
|
||||
bli_cdotv_zen_int5
|
||||
(
|
||||
BLIS_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Call BLIS interface. */
|
||||
PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
|
||||
return rho;
|
||||
}
|
||||
|
||||
dcomplex zdotc_
|
||||
(
|
||||
const f77_int* n,
|
||||
const dcomplex* x, const f77_int* incx,
|
||||
const dcomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, *incx, *incy);
|
||||
dim_t n0;
|
||||
dcomplex* x0;
|
||||
dcomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
dcomplex rho;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((dcomplex*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((dcomplex*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE)
|
||||
{
|
||||
/* Call BLIS kernel. */
|
||||
bli_zdotv_zen_int5
|
||||
(
|
||||
BLIS_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Call BLIS interface. */
|
||||
PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
|
||||
return rho;
|
||||
}
|
||||
|
||||
#else // BLIS_DISABLE_COMPLEX_RETURN_INTEL
|
||||
// For the "intel" complex return type, use a hidden parameter to return the result
|
||||
#undef GENTFUNCDOT
|
||||
#define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
|
||||
\
|
||||
void PASTEF772(ch,blasname,chc) \
|
||||
( \
|
||||
ftype* rhop, \
|
||||
const f77_int* n, \
|
||||
const ftype* x, const f77_int* incx, \
|
||||
const ftype* y, const f77_int* incy \
|
||||
) \
|
||||
{ \
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
|
||||
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, *incx, *incy); \
|
||||
dim_t n0; \
|
||||
ftype* x0; \
|
||||
ftype* y0; \
|
||||
inc_t incx0; \
|
||||
inc_t incy0; \
|
||||
ftype rho; \
|
||||
\
|
||||
/* Initialize BLIS. */ \
|
||||
bli_init_auto(); \
|
||||
\
|
||||
/* Convert/typecast negative values of n to zero. */ \
|
||||
bli_convert_blas_dim1( *n, n0 ); \
|
||||
\
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */ \
|
||||
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
|
||||
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
|
||||
\
|
||||
/* Call BLIS interface. */ \
|
||||
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
blis_conjx, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n0, \
|
||||
x0, incx0, \
|
||||
y0, incy0, \
|
||||
&rho, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
\
|
||||
/* Finalize BLIS. */ \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
|
||||
bli_finalize_auto(); \
|
||||
\
|
||||
*rhop = rho; \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCDOTC_BLAS( dot, dotv )
|
||||
#endif // BLIS_DISABLE_COMPLEX_RETURN_INTEL
|
||||
|
||||
|
||||
|
||||
// -- "Black sheep" dot product function definitions --
|
||||
|
||||
// Input vectors stored in single precision, computed in double precision,
|
||||
// with result returned in single precision.
|
||||
float PASTEF77(sd,sdot)
|
||||
(
|
||||
const f77_int* n,
|
||||
const float* sb,
|
||||
const float* x, const f77_int* incx,
|
||||
const float* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
return ( float )
|
||||
(
|
||||
( double )(*sb) +
|
||||
PASTEF77(d,sdot)
|
||||
(
|
||||
n,
|
||||
x, incx,
|
||||
y, incy
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
// Input vectors stored in single precision, computed in double precision,
|
||||
// with result returned in double precision.
|
||||
double PASTEF77(d,sdot)
|
||||
(
|
||||
const f77_int* n,
|
||||
const float* x, const f77_int* incx,
|
||||
const float* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
float* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
double rho;
|
||||
dim_t i;
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy);
|
||||
/* Initialization of BLIS is not required. */
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
bli_convert_blas_dim1( *n, n0 );
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
bli_convert_blas_incv( n0, (float*)x, *incx, x0, incx0 );
|
||||
bli_convert_blas_incv( n0, (float*)y, *incy, y0, incy0 );
|
||||
|
||||
rho = 0.0;
|
||||
|
||||
for ( i = 0; i < n0; i++ )
|
||||
{
|
||||
float* chi1 = x0 + (i )*incx0;
|
||||
float* psi1 = y0 + (i )*incy0;
|
||||
|
||||
bli_ddots( (( double )(*chi1)),
|
||||
(( double )(*psi1)), rho );
|
||||
}
|
||||
|
||||
/* Finalization of BLIS is not required, because initialization was
|
||||
not required. */
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
|
||||
return rho;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019 - 21, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2019 - 22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -300,512 +300,7 @@ void PASTEF77(ch,blasname) \
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
void dgemm_
|
||||
(
|
||||
const f77_char* transa,
|
||||
const f77_char* transb,
|
||||
const f77_int* m,
|
||||
const f77_int* n,
|
||||
const f77_int* k,
|
||||
const double* alpha,
|
||||
const double* a, const f77_int* lda,
|
||||
const double* b, const f77_int* ldb,
|
||||
const double* beta,
|
||||
double* c, const f77_int* ldc
|
||||
)
|
||||
{
|
||||
|
||||
|
||||
|
||||
trans_t blis_transa;
|
||||
trans_t blis_transb;
|
||||
dim_t m0, n0, k0;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
bli_init_auto();
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k, \
|
||||
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
|
||||
|
||||
/* Perform BLAS parameter checking. */
|
||||
PASTEBLACHK(gemm)
|
||||
(
|
||||
MKSTR(d),
|
||||
MKSTR(gemm),
|
||||
transa,
|
||||
transb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
lda,
|
||||
ldb,
|
||||
ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans(*transa, &blis_transa);
|
||||
bli_param_map_netlib_to_blis_trans(*transb, &blis_transb);
|
||||
|
||||
/* Typecast BLAS integers to BLIS integers. */
|
||||
bli_convert_blas_dim1(*m, m0);
|
||||
bli_convert_blas_dim1(*n, n0);
|
||||
bli_convert_blas_dim1(*k, k0);
|
||||
|
||||
|
||||
/* Set the row and column strides of the matrix operands. */
|
||||
const inc_t rs_a = 1;
|
||||
const inc_t cs_a = *lda;
|
||||
const inc_t rs_b = 1;
|
||||
const inc_t cs_b = *ldb;
|
||||
const inc_t rs_c = 1;
|
||||
const inc_t cs_c = *ldc;
|
||||
|
||||
// When dynamic dispatch is enabled i.e. library is built for ‘amdzen’ configuration.
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (!bamdzen)
|
||||
{
|
||||
// This code is duplicated below, however we don't want to move it out of
|
||||
// this IF block as it will affect the performance on Zen architetures
|
||||
// Also this is temporary fix which will be replaced later.
|
||||
const num_t dt = BLIS_DOUBLE;
|
||||
|
||||
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
|
||||
obj_t ao = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t bo = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
|
||||
obj_t co = BLIS_OBJECT_INITIALIZER;
|
||||
|
||||
dim_t m0_a, n0_a;
|
||||
dim_t m0_b, n0_b;
|
||||
|
||||
bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a);
|
||||
bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b);
|
||||
|
||||
bli_obj_init_finish_1x1(dt, (double *)alpha, &alphao);
|
||||
bli_obj_init_finish_1x1(dt, (double *)beta, &betao);
|
||||
|
||||
bli_obj_init_finish(dt, m0_a, n0_a, (double *)a, rs_a, cs_a, &ao);
|
||||
bli_obj_init_finish(dt, m0_b, n0_b, (double *)b, rs_b, cs_b, &bo);
|
||||
bli_obj_init_finish(dt, m0, n0, (double *)c, rs_c, cs_c, &co);
|
||||
|
||||
bli_obj_set_conjtrans(blis_transa, &ao);
|
||||
bli_obj_set_conjtrans(blis_transb, &bo);
|
||||
|
||||
// Will call parallelized dgemm code - sup & native
|
||||
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
|
||||
(
|
||||
&alphao,
|
||||
&ao,
|
||||
&bo,
|
||||
&betao,
|
||||
&co,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
return;
|
||||
}
|
||||
|
||||
if((k0 == 1) && bli_is_notrans(blis_transa) && bli_is_notrans(blis_transb))
|
||||
{
|
||||
bli_dgemm_ref_k1_nn( m0, n0, k0,
|
||||
(double*)alpha,
|
||||
(double*)a, *lda,
|
||||
(double*)b, *ldb,
|
||||
(double*)beta,
|
||||
c, *ldc
|
||||
);
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS */
|
||||
bli_finalize_auto();
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (n0 == 1)
|
||||
{
|
||||
if (bli_is_notrans(blis_transa))
|
||||
{
|
||||
bli_dgemv_unf_var2(
|
||||
BLIS_NO_TRANSPOSE,
|
||||
bli_extract_conj(blis_transb),
|
||||
m0, k0,
|
||||
(double*)alpha,
|
||||
(double*)a, rs_a, cs_a,
|
||||
(double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b,
|
||||
(double*)beta,
|
||||
c, rs_c,
|
||||
((void*)0)
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_dgemv_unf_var1(
|
||||
blis_transa,
|
||||
bli_extract_conj(blis_transb),
|
||||
k0, m0,
|
||||
(double*)alpha,
|
||||
(double*)a, rs_a, cs_a,
|
||||
(double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b,
|
||||
(double*)beta,
|
||||
c, rs_c,
|
||||
((void*)0)
|
||||
);
|
||||
}
|
||||
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
|
||||
return;
|
||||
}
|
||||
else if (m0 == 1)
|
||||
{
|
||||
if (bli_is_notrans(blis_transb))
|
||||
{
|
||||
bli_dgemv_unf_var1(
|
||||
blis_transb,
|
||||
bli_extract_conj(blis_transa),
|
||||
n0, k0,
|
||||
(double*)alpha,
|
||||
(double*)b, cs_b, rs_b,
|
||||
(double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a,
|
||||
(double*)beta,
|
||||
c, cs_c,
|
||||
((void*)0)
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_dgemv_unf_var2(
|
||||
blis_transb,
|
||||
bli_extract_conj(blis_transa),
|
||||
k0, n0,
|
||||
(double*)alpha,
|
||||
(double*)b, cs_b, rs_b,
|
||||
(double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a,
|
||||
(double*)beta,
|
||||
c, cs_c,
|
||||
((void*)0)
|
||||
);
|
||||
}
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
return;
|
||||
}
|
||||
|
||||
const num_t dt = BLIS_DOUBLE;
|
||||
|
||||
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
|
||||
obj_t ao = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t bo = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
|
||||
obj_t co = BLIS_OBJECT_INITIALIZER;
|
||||
|
||||
dim_t m0_a, n0_a;
|
||||
dim_t m0_b, n0_b;
|
||||
|
||||
bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a);
|
||||
bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b);
|
||||
|
||||
bli_obj_init_finish_1x1(dt, (double*)alpha, &alphao);
|
||||
bli_obj_init_finish_1x1(dt, (double*)beta, &betao);
|
||||
|
||||
bli_obj_init_finish(dt, m0_a, n0_a, (double*)a, rs_a, cs_a, &ao);
|
||||
bli_obj_init_finish(dt, m0_b, n0_b, (double*)b, rs_b, cs_b, &bo);
|
||||
bli_obj_init_finish(dt, m0, n0, (double*)c, rs_c, cs_c, &co);
|
||||
|
||||
bli_obj_set_conjtrans(blis_transa, &ao);
|
||||
bli_obj_set_conjtrans(blis_transb, &bo);
|
||||
|
||||
//cntx_t* cntx = bli_gks_query_cntx();
|
||||
//dim_t nt = bli_thread_get_num_threads(); // get number of threads
|
||||
bool nt = bli_thread_get_is_parallel(); // Check if parallel dgemm is invoked.
|
||||
|
||||
// if m0 is large and (n0 & k0) < 10 - SMALL GEMM - ST is better
|
||||
//
|
||||
|
||||
#ifdef AOCL_DYNAMIC
|
||||
if (nt && ((n0 > 10 ) || (k0 > 10)) )
|
||||
#else
|
||||
if (nt)
|
||||
#endif
|
||||
{
|
||||
// Will call parallelized dgemm code - sup & native
|
||||
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
|
||||
(
|
||||
&alphao,
|
||||
&ao,
|
||||
&bo,
|
||||
&betao,
|
||||
&co,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
return;
|
||||
}
|
||||
|
||||
// The code below will be called when number of threads = 1.
|
||||
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX
|
||||
|
||||
//if( ((m0 + n0 -k0) < 2000) && ((m0 + k0-n0) < 2000) && ((n0 + k0-m0) < 2000) && (n0 > 2))
|
||||
if( ( ( (m0 + n0 -k0) < 2000) && ((m0 + k0-n0) < 2000) && ((n0 + k0-m0) < 2000) ) ||
|
||||
((n0 <= 10) && (k0 <=10)) )
|
||||
{
|
||||
err_t status;
|
||||
if (bli_is_notrans(blis_transa))
|
||||
{
|
||||
status = bli_dgemm_small( &alphao,
|
||||
&ao,
|
||||
&bo,
|
||||
&betao,
|
||||
&co,
|
||||
NULL, //cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
status = bli_dgemm_small_At ( &alphao,
|
||||
&ao,
|
||||
&bo,
|
||||
&betao,
|
||||
&co,
|
||||
NULL, //cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
if (status == BLIS_SUCCESS)
|
||||
{
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
#endif //#ifdef BLIS_ENABLE_SMALL_MATRIX
|
||||
|
||||
err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
|
||||
if (status == BLIS_SUCCESS)
|
||||
{
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
return;
|
||||
}
|
||||
|
||||
// fall back on native path when dgemm is not handled in sup path.
|
||||
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
|
||||
|
||||
|
||||
/* PASTEMAC(gemm, BLIS_OAPI_EX_SUF) */
|
||||
/* ( */
|
||||
/* &alphao, */
|
||||
/* &ao, */
|
||||
/* &bo, */
|
||||
/* &betao, */
|
||||
/* &co, */
|
||||
/* NULL, */
|
||||
/* NULL */
|
||||
/* ); */
|
||||
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
} // end of dgemm_
|
||||
|
||||
void zgemm_
|
||||
(
|
||||
const f77_char* transa,
|
||||
const f77_char* transb,
|
||||
const f77_int* m,
|
||||
const f77_int* n,
|
||||
const f77_int* k,
|
||||
const dcomplex* alpha,
|
||||
const dcomplex* a, const f77_int* lda,
|
||||
const dcomplex* b, const f77_int* ldb,
|
||||
const dcomplex* beta,
|
||||
dcomplex* c, const f77_int* ldc
|
||||
)
|
||||
{
|
||||
trans_t blis_transa;
|
||||
trans_t blis_transb;
|
||||
dim_t m0, n0, k0;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
bli_init_auto();
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k,
|
||||
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
|
||||
|
||||
/* Perform BLAS parameter checking. */
|
||||
PASTEBLACHK(gemm)
|
||||
(
|
||||
MKSTR(z),
|
||||
MKSTR(gemm),
|
||||
transa,
|
||||
transb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
lda,
|
||||
ldb,
|
||||
ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa );
|
||||
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb );
|
||||
|
||||
/* Typecast BLAS integers to BLIS integers. */
|
||||
bli_convert_blas_dim1( *m, m0 );
|
||||
bli_convert_blas_dim1( *n, n0 );
|
||||
bli_convert_blas_dim1( *k, k0 );
|
||||
|
||||
/* Set the row and column strides of the matrix operands. */
|
||||
const inc_t rs_a = 1;
|
||||
const inc_t cs_a = *lda;
|
||||
const inc_t rs_b = 1;
|
||||
const inc_t cs_b = *ldb;
|
||||
const inc_t rs_c = 1;
|
||||
const inc_t cs_c = *ldc;
|
||||
|
||||
const num_t dt = BLIS_DCOMPLEX;
|
||||
|
||||
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
|
||||
obj_t ao = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t bo = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
|
||||
obj_t co = BLIS_OBJECT_INITIALIZER;
|
||||
|
||||
dim_t m0_a, n0_a;
|
||||
dim_t m0_b, n0_b;
|
||||
|
||||
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a );
|
||||
bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b );
|
||||
|
||||
bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao );
|
||||
bli_obj_init_finish_1x1( dt, (dcomplex*)beta, &betao );
|
||||
|
||||
bli_obj_init_finish( dt, m0_a, n0_a, (dcomplex*)a, rs_a, cs_a, &ao );
|
||||
bli_obj_init_finish( dt, m0_b, n0_b, (dcomplex*)b, rs_b, cs_b, &bo );
|
||||
bli_obj_init_finish( dt, m0, n0, (dcomplex*)c, rs_c, cs_c, &co );
|
||||
|
||||
bli_obj_set_conjtrans( blis_transa, &ao );
|
||||
bli_obj_set_conjtrans( blis_transb, &bo );
|
||||
|
||||
// default instance peformance tuning is done in zgemm.
|
||||
// Single instance tuning is done based on env set.
|
||||
dim_t single_instance = bli_env_get_var( "BLIS_SINGLE_INSTANCE", -1 );
|
||||
|
||||
//dim_t nt = bli_thread_get_num_threads(); // get number of threads
|
||||
bool nt = bli_thread_get_is_parallel(); // Check if parallel zgemm is invoked.
|
||||
if ( nt )
|
||||
{
|
||||
// Will call parallelized zgemm code - sup & native
|
||||
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
|
||||
(
|
||||
&alphao,
|
||||
&ao,
|
||||
&bo,
|
||||
&betao,
|
||||
&co,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
return;
|
||||
}
|
||||
|
||||
// The code below will be called when number of threads = 1.
|
||||
#if ENABLE_INDUCED_METHOD
|
||||
/* 3m_sqp is optimal for certain matrix shapes.
|
||||
Initial study that it works well for square sizes and sizes closer to square shape.
|
||||
|
||||
* Usage of 3m_sqp is restricted to sizes, where it is found efficient compared to native, sup and other induced method.
|
||||
* Further investigation is necessary to make the usage choices more generic. */
|
||||
bool sqp_on = false;
|
||||
if( (m0 == n0 ) && ( n0 == k0 ) && ( m0 == 128 ) )
|
||||
{
|
||||
sqp_on = true;
|
||||
}
|
||||
|
||||
// current range of sizes used for 3m_sqp to be expaned after evaluation.
|
||||
if( ( m0 >= 4200) && ( m0 <= 4600 ) && ( ( n0 >= 326 ) || (n0 <= 1600 ) )
|
||||
&& ( k0 == 1120 ) ) //to be tuned further.
|
||||
{
|
||||
sqp_on = true;
|
||||
}
|
||||
|
||||
if( ( blis_transb == BLIS_NO_TRANSPOSE) && ( sqp_on == true ) )
|
||||
{
|
||||
//sqp algo is found better for n > 40
|
||||
if(bli_gemm_sqp(&alphao, &ao, &bo, &betao, &co, NULL, NULL)==BLIS_SUCCESS)
|
||||
{
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
return;
|
||||
}
|
||||
}
|
||||
#endif//ENABLE_INDUCED_METHOD
|
||||
|
||||
// native tuning resulted in better numbers compared to sup in constrained multi-instance
|
||||
// sup has been enabled for single instance cases.
|
||||
if(single_instance==1)
|
||||
{
|
||||
err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
|
||||
if(status==BLIS_SUCCESS)
|
||||
{
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
// fall back on native path when zgemm is not handled in sup path.
|
||||
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
return;
|
||||
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
}// end of zgemm_
|
||||
|
||||
|
||||
INSERT_GENTFUNC_BLAS_SC( gemm, gemm )
|
||||
#else
|
||||
INSERT_GENTFUNC_BLAS( gemm,gemm )
|
||||
#endif
|
||||
|
||||
// Observed a regression in dgemm with this function addition.
|
||||
// Disabling temporarily.
|
||||
|
||||
894
frame/compat/bla_gemm_amd.c
Normal file
894
frame/compat/bla_gemm_amd.c
Normal file
@@ -0,0 +1,894 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019 - 22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
//
|
||||
// Define BLAS-to-BLIS interfaces.
|
||||
//
|
||||
#define ENABLE_INDUCED_METHOD 0
|
||||
#ifdef BLIS_BLAS3_CALLS_TAPI
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ftype, ch, blasname, blisname ) \
|
||||
\
|
||||
void PASTEF77(ch,blasname) \
|
||||
( \
|
||||
const f77_char* transa, \
|
||||
const f77_char* transb, \
|
||||
const f77_int* m, \
|
||||
const f77_int* n, \
|
||||
const f77_int* k, \
|
||||
const ftype* alpha, \
|
||||
const ftype* a, const f77_int* lda, \
|
||||
const ftype* b, const f77_int* ldb, \
|
||||
const ftype* beta, \
|
||||
ftype* c, const f77_int* ldc \
|
||||
) \
|
||||
{ \
|
||||
trans_t blis_transa; \
|
||||
trans_t blis_transb; \
|
||||
dim_t m0, n0, k0; \
|
||||
inc_t rs_a, cs_a; \
|
||||
inc_t rs_b, cs_b; \
|
||||
inc_t rs_c, cs_c; \
|
||||
\
|
||||
/* Initialize BLIS. */ \
|
||||
bli_init_auto(); \
|
||||
\
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
|
||||
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *transb, *m, *n, *k, \
|
||||
(void*)alpha, *lda, *ldb, (void*)beta, *ldc); \
|
||||
\
|
||||
/* Perform BLAS parameter checking. */ \
|
||||
PASTEBLACHK(blasname) \
|
||||
( \
|
||||
MKSTR(ch), \
|
||||
MKSTR(blasname), \
|
||||
transa, \
|
||||
transb, \
|
||||
m, \
|
||||
n, \
|
||||
k, \
|
||||
lda, \
|
||||
ldb, \
|
||||
ldc \
|
||||
); \
|
||||
\
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
|
||||
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
|
||||
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
|
||||
\
|
||||
/* Typecast BLAS integers to BLIS integers. */ \
|
||||
bli_convert_blas_dim1( *m, m0 ); \
|
||||
bli_convert_blas_dim1( *n, n0 ); \
|
||||
bli_convert_blas_dim1( *k, k0 ); \
|
||||
\
|
||||
/* Set the row and column strides of the matrix operands. */ \
|
||||
rs_a = 1; \
|
||||
cs_a = *lda; \
|
||||
rs_b = 1; \
|
||||
cs_b = *ldb; \
|
||||
rs_c = 1; \
|
||||
cs_c = *ldc; \
|
||||
\
|
||||
/* Call BLIS interface. */ \
|
||||
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
blis_transa, \
|
||||
blis_transb, \
|
||||
m0, \
|
||||
n0, \
|
||||
k0, \
|
||||
(ftype*)alpha, \
|
||||
(ftype*)a, rs_a, cs_a, \
|
||||
(ftype*)b, rs_b, cs_b, \
|
||||
(ftype*)beta, \
|
||||
(ftype*)c, rs_c, cs_c, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
\
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ftype, ch, blasname, blisname ) \
|
||||
\
|
||||
void PASTEF77(ch,blasname) \
|
||||
( \
|
||||
const f77_char* transa, \
|
||||
const f77_char* transb, \
|
||||
const f77_int* m, \
|
||||
const f77_int* n, \
|
||||
const f77_int* k, \
|
||||
const ftype* alpha, \
|
||||
const ftype* a, const f77_int* lda, \
|
||||
const ftype* b, const f77_int* ldb, \
|
||||
const ftype* beta, \
|
||||
ftype* c, const f77_int* ldc \
|
||||
) \
|
||||
{ \
|
||||
\
|
||||
trans_t blis_transa; \
|
||||
trans_t blis_transb; \
|
||||
dim_t m0, n0, k0; \
|
||||
\
|
||||
dim_t m0_a, n0_a; \
|
||||
dim_t m0_b, n0_b; \
|
||||
\
|
||||
/* Initialize BLIS. */ \
|
||||
bli_init_auto(); \
|
||||
\
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
|
||||
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *transb, *m, *n, *k, \
|
||||
(void*)alpha, *lda, *ldb, (void*)beta, *ldc); \
|
||||
\
|
||||
/* Perform BLAS parameter checking. */ \
|
||||
PASTEBLACHK(blasname) \
|
||||
( \
|
||||
MKSTR(ch), \
|
||||
MKSTR(blasname), \
|
||||
transa, \
|
||||
transb, \
|
||||
m, \
|
||||
n, \
|
||||
k, \
|
||||
lda, \
|
||||
ldb, \
|
||||
ldc \
|
||||
); \
|
||||
\
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
|
||||
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
|
||||
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
|
||||
\
|
||||
/* Typecast BLAS integers to BLIS integers. */ \
|
||||
bli_convert_blas_dim1( *m, m0 ); \
|
||||
bli_convert_blas_dim1( *n, n0 ); \
|
||||
bli_convert_blas_dim1( *k, k0 ); \
|
||||
\
|
||||
/* Set the row and column strides of the matrix operands. */ \
|
||||
const inc_t rs_a = 1; \
|
||||
const inc_t cs_a = *lda; \
|
||||
const inc_t rs_b = 1; \
|
||||
const inc_t cs_b = *ldb; \
|
||||
const inc_t rs_c = 1; \
|
||||
const inc_t cs_c = *ldc; \
|
||||
\
|
||||
if( n0 == 1 ) \
|
||||
{ \
|
||||
if(bli_is_notrans(blis_transa)) \
|
||||
{ \
|
||||
PASTEMAC(ch,gemv_unf_var2)( \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
bli_extract_conj(blis_transb), \
|
||||
m0, k0, \
|
||||
(ftype*)alpha, \
|
||||
(ftype*)a, rs_a, cs_a,\
|
||||
(ftype*)b, bli_is_notrans(blis_transb)?rs_b:cs_b, \
|
||||
(ftype*) beta, \
|
||||
c, rs_c, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,gemv_unf_var1)( \
|
||||
blis_transa, \
|
||||
bli_extract_conj(blis_transb), \
|
||||
k0, m0, \
|
||||
(ftype*)alpha, \
|
||||
(ftype*)a, rs_a, cs_a, \
|
||||
(ftype*)b, bli_is_notrans(blis_transb)?rs_b:cs_b, \
|
||||
(ftype*)beta, \
|
||||
c, rs_c, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
|
||||
return; \
|
||||
} \
|
||||
else if( m0 == 1 ) \
|
||||
{ \
|
||||
if(bli_is_notrans(blis_transb)) \
|
||||
{ \
|
||||
PASTEMAC(ch,gemv_unf_var1)( \
|
||||
blis_transb, \
|
||||
bli_extract_conj(blis_transa), \
|
||||
n0, k0, \
|
||||
(ftype*)alpha, \
|
||||
(ftype*)b, cs_b, rs_b, \
|
||||
(ftype*)a, bli_is_notrans(blis_transa)?cs_a:rs_a, \
|
||||
(ftype*)beta, \
|
||||
c, cs_c, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,gemv_unf_var2)( \
|
||||
blis_transb, \
|
||||
bli_extract_conj(blis_transa), \
|
||||
k0, n0, \
|
||||
(ftype*)alpha, \
|
||||
(ftype*)b, cs_b, rs_b, \
|
||||
(ftype*)a, bli_is_notrans(blis_transa)?cs_a:rs_a, \
|
||||
(ftype*)beta, \
|
||||
c, cs_c, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
|
||||
obj_t ao = BLIS_OBJECT_INITIALIZER; \
|
||||
obj_t bo = BLIS_OBJECT_INITIALIZER; \
|
||||
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
|
||||
obj_t co = BLIS_OBJECT_INITIALIZER; \
|
||||
\
|
||||
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \
|
||||
bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); \
|
||||
\
|
||||
bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \
|
||||
bli_obj_init_finish_1x1( dt, (ftype*)beta, &betao ); \
|
||||
\
|
||||
bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \
|
||||
bli_obj_init_finish( dt, m0, n0, (ftype*)c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
bli_obj_set_conjtrans( blis_transa, &ao ); \
|
||||
bli_obj_set_conjtrans( blis_transb, &bo ); \
|
||||
\
|
||||
PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
|
||||
( \
|
||||
&alphao, \
|
||||
&ao, \
|
||||
&bo, \
|
||||
&betao, \
|
||||
&co, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
\
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
void dgemm_
|
||||
(
|
||||
const f77_char* transa,
|
||||
const f77_char* transb,
|
||||
const f77_int* m,
|
||||
const f77_int* n,
|
||||
const f77_int* k,
|
||||
const double* alpha,
|
||||
const double* a, const f77_int* lda,
|
||||
const double* b, const f77_int* ldb,
|
||||
const double* beta,
|
||||
double* c, const f77_int* ldc
|
||||
)
|
||||
{
|
||||
|
||||
|
||||
|
||||
trans_t blis_transa;
|
||||
trans_t blis_transb;
|
||||
dim_t m0, n0, k0;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
bli_init_auto();
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k, \
|
||||
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
|
||||
|
||||
/* Perform BLAS parameter checking. */
|
||||
PASTEBLACHK(gemm)
|
||||
(
|
||||
MKSTR(d),
|
||||
MKSTR(gemm),
|
||||
transa,
|
||||
transb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
lda,
|
||||
ldb,
|
||||
ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans(*transa, &blis_transa);
|
||||
bli_param_map_netlib_to_blis_trans(*transb, &blis_transb);
|
||||
|
||||
/* Typecast BLAS integers to BLIS integers. */
|
||||
bli_convert_blas_dim1(*m, m0);
|
||||
bli_convert_blas_dim1(*n, n0);
|
||||
bli_convert_blas_dim1(*k, k0);
|
||||
|
||||
|
||||
/* Set the row and column strides of the matrix operands. */
|
||||
const inc_t rs_a = 1;
|
||||
const inc_t cs_a = *lda;
|
||||
const inc_t rs_b = 1;
|
||||
const inc_t cs_b = *ldb;
|
||||
const inc_t rs_c = 1;
|
||||
const inc_t cs_c = *ldc;
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == FALSE)
|
||||
{
|
||||
// This code is duplicated below, however we don't want to move it out of
|
||||
// this IF block as it will affect the performance on Zen architetures
|
||||
// Also this is temporary fix which will be replaced later.
|
||||
const num_t dt = BLIS_DOUBLE;
|
||||
|
||||
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
|
||||
obj_t ao = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t bo = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
|
||||
obj_t co = BLIS_OBJECT_INITIALIZER;
|
||||
|
||||
dim_t m0_a, n0_a;
|
||||
dim_t m0_b, n0_b;
|
||||
|
||||
bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a);
|
||||
bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b);
|
||||
|
||||
bli_obj_init_finish_1x1(dt, (double *)alpha, &alphao);
|
||||
bli_obj_init_finish_1x1(dt, (double *)beta, &betao);
|
||||
|
||||
bli_obj_init_finish(dt, m0_a, n0_a, (double *)a, rs_a, cs_a, &ao);
|
||||
bli_obj_init_finish(dt, m0_b, n0_b, (double *)b, rs_b, cs_b, &bo);
|
||||
bli_obj_init_finish(dt, m0, n0, (double *)c, rs_c, cs_c, &co);
|
||||
|
||||
bli_obj_set_conjtrans(blis_transa, &ao);
|
||||
bli_obj_set_conjtrans(blis_transb, &bo);
|
||||
|
||||
// Will call parallelized dgemm code - sup & native
|
||||
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
|
||||
(
|
||||
&alphao,
|
||||
&ao,
|
||||
&bo,
|
||||
&betao,
|
||||
&co,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
return;
|
||||
}
|
||||
|
||||
if((k0 == 1) && bli_is_notrans(blis_transa) && bli_is_notrans(blis_transb))
|
||||
{
|
||||
bli_dgemm_ref_k1_nn( m0, n0, k0,
|
||||
(double*)alpha,
|
||||
(double*)a, *lda,
|
||||
(double*)b, *ldb,
|
||||
(double*)beta,
|
||||
c, *ldc
|
||||
);
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS */
|
||||
bli_finalize_auto();
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (n0 == 1)
|
||||
{
|
||||
if (bli_is_notrans(blis_transa))
|
||||
{
|
||||
bli_dgemv_unf_var2(
|
||||
BLIS_NO_TRANSPOSE,
|
||||
bli_extract_conj(blis_transb),
|
||||
m0, k0,
|
||||
(double*)alpha,
|
||||
(double*)a, rs_a, cs_a,
|
||||
(double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b,
|
||||
(double*)beta,
|
||||
c, rs_c,
|
||||
((void*)0)
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_dgemv_unf_var1(
|
||||
blis_transa,
|
||||
bli_extract_conj(blis_transb),
|
||||
k0, m0,
|
||||
(double*)alpha,
|
||||
(double*)a, rs_a, cs_a,
|
||||
(double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b,
|
||||
(double*)beta,
|
||||
c, rs_c,
|
||||
((void*)0)
|
||||
);
|
||||
}
|
||||
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
|
||||
return;
|
||||
}
|
||||
else if (m0 == 1)
|
||||
{
|
||||
if (bli_is_notrans(blis_transb))
|
||||
{
|
||||
bli_dgemv_unf_var1(
|
||||
blis_transb,
|
||||
bli_extract_conj(blis_transa),
|
||||
n0, k0,
|
||||
(double*)alpha,
|
||||
(double*)b, cs_b, rs_b,
|
||||
(double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a,
|
||||
(double*)beta,
|
||||
c, cs_c,
|
||||
((void*)0)
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_dgemv_unf_var2(
|
||||
blis_transb,
|
||||
bli_extract_conj(blis_transa),
|
||||
k0, n0,
|
||||
(double*)alpha,
|
||||
(double*)b, cs_b, rs_b,
|
||||
(double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a,
|
||||
(double*)beta,
|
||||
c, cs_c,
|
||||
((void*)0)
|
||||
);
|
||||
}
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
return;
|
||||
}
|
||||
|
||||
const num_t dt = BLIS_DOUBLE;
|
||||
|
||||
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
|
||||
obj_t ao = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t bo = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
|
||||
obj_t co = BLIS_OBJECT_INITIALIZER;
|
||||
|
||||
dim_t m0_a, n0_a;
|
||||
dim_t m0_b, n0_b;
|
||||
|
||||
bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a);
|
||||
bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b);
|
||||
|
||||
bli_obj_init_finish_1x1(dt, (double*)alpha, &alphao);
|
||||
bli_obj_init_finish_1x1(dt, (double*)beta, &betao);
|
||||
|
||||
bli_obj_init_finish(dt, m0_a, n0_a, (double*)a, rs_a, cs_a, &ao);
|
||||
bli_obj_init_finish(dt, m0_b, n0_b, (double*)b, rs_b, cs_b, &bo);
|
||||
bli_obj_init_finish(dt, m0, n0, (double*)c, rs_c, cs_c, &co);
|
||||
|
||||
bli_obj_set_conjtrans(blis_transa, &ao);
|
||||
bli_obj_set_conjtrans(blis_transb, &bo);
|
||||
|
||||
//cntx_t* cntx = bli_gks_query_cntx();
|
||||
//dim_t nt = bli_thread_get_num_threads(); // get number of threads
|
||||
bool nt = bli_thread_get_is_parallel(); // Check if parallel dgemm is invoked.
|
||||
|
||||
// if m0 is large and (n0 & k0) < 10 - SMALL GEMM - ST is better
|
||||
//
|
||||
|
||||
#ifdef AOCL_DYNAMIC
|
||||
if (nt && ((n0 > 10 ) || (k0 > 10)) )
|
||||
#else
|
||||
if (nt)
|
||||
#endif
|
||||
{
|
||||
// Will call parallelized dgemm code - sup & native
|
||||
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
|
||||
(
|
||||
&alphao,
|
||||
&ao,
|
||||
&bo,
|
||||
&betao,
|
||||
&co,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
return;
|
||||
}
|
||||
|
||||
// The code below will be called when number of threads = 1.
|
||||
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX
|
||||
|
||||
//if( ((m0 + n0 -k0) < 2000) && ((m0 + k0-n0) < 2000) && ((n0 + k0-m0) < 2000) && (n0 > 2))
|
||||
if( ( ( (m0 + n0 -k0) < 2000) && ((m0 + k0-n0) < 2000) && ((n0 + k0-m0) < 2000) ) ||
|
||||
((n0 <= 10) && (k0 <=10)) )
|
||||
{
|
||||
err_t status;
|
||||
if (bli_is_notrans(blis_transa))
|
||||
{
|
||||
status = bli_dgemm_small( &alphao,
|
||||
&ao,
|
||||
&bo,
|
||||
&betao,
|
||||
&co,
|
||||
NULL, //cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
status = bli_dgemm_small_At ( &alphao,
|
||||
&ao,
|
||||
&bo,
|
||||
&betao,
|
||||
&co,
|
||||
NULL, //cntx,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
if (status == BLIS_SUCCESS)
|
||||
{
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
#endif //#ifdef BLIS_ENABLE_SMALL_MATRIX
|
||||
|
||||
err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
|
||||
if (status == BLIS_SUCCESS)
|
||||
{
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
return;
|
||||
}
|
||||
|
||||
// fall back on native path when dgemm is not handled in sup path.
|
||||
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
|
||||
|
||||
|
||||
/* PASTEMAC(gemm, BLIS_OAPI_EX_SUF) */
|
||||
/* ( */
|
||||
/* &alphao, */
|
||||
/* &ao, */
|
||||
/* &bo, */
|
||||
/* &betao, */
|
||||
/* &co, */
|
||||
/* NULL, */
|
||||
/* NULL */
|
||||
/* ); */
|
||||
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
} // end of dgemm_
|
||||
|
||||
void zgemm_
|
||||
(
|
||||
const f77_char* transa,
|
||||
const f77_char* transb,
|
||||
const f77_int* m,
|
||||
const f77_int* n,
|
||||
const f77_int* k,
|
||||
const dcomplex* alpha,
|
||||
const dcomplex* a, const f77_int* lda,
|
||||
const dcomplex* b, const f77_int* ldb,
|
||||
const dcomplex* beta,
|
||||
dcomplex* c, const f77_int* ldc
|
||||
)
|
||||
{
|
||||
trans_t blis_transa;
|
||||
trans_t blis_transb;
|
||||
dim_t m0, n0, k0;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
bli_init_auto();
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k,
|
||||
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
|
||||
|
||||
/* Perform BLAS parameter checking. */
|
||||
PASTEBLACHK(gemm)
|
||||
(
|
||||
MKSTR(z),
|
||||
MKSTR(gemm),
|
||||
transa,
|
||||
transb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
lda,
|
||||
ldb,
|
||||
ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa );
|
||||
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb );
|
||||
|
||||
/* Typecast BLAS integers to BLIS integers. */
|
||||
bli_convert_blas_dim1( *m, m0 );
|
||||
bli_convert_blas_dim1( *n, n0 );
|
||||
bli_convert_blas_dim1( *k, k0 );
|
||||
|
||||
/* Set the row and column strides of the matrix operands. */
|
||||
const inc_t rs_a = 1;
|
||||
const inc_t cs_a = *lda;
|
||||
const inc_t rs_b = 1;
|
||||
const inc_t cs_b = *ldb;
|
||||
const inc_t rs_c = 1;
|
||||
const inc_t cs_c = *ldc;
|
||||
|
||||
const num_t dt = BLIS_DCOMPLEX;
|
||||
|
||||
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
|
||||
obj_t ao = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t bo = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
|
||||
obj_t co = BLIS_OBJECT_INITIALIZER;
|
||||
|
||||
dim_t m0_a, n0_a;
|
||||
dim_t m0_b, n0_b;
|
||||
|
||||
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a );
|
||||
bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b );
|
||||
|
||||
bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao );
|
||||
bli_obj_init_finish_1x1( dt, (dcomplex*)beta, &betao );
|
||||
|
||||
bli_obj_init_finish( dt, m0_a, n0_a, (dcomplex*)a, rs_a, cs_a, &ao );
|
||||
bli_obj_init_finish( dt, m0_b, n0_b, (dcomplex*)b, rs_b, cs_b, &bo );
|
||||
bli_obj_init_finish( dt, m0, n0, (dcomplex*)c, rs_c, cs_c, &co );
|
||||
|
||||
bli_obj_set_conjtrans( blis_transa, &ao );
|
||||
bli_obj_set_conjtrans( blis_transb, &bo );
|
||||
|
||||
// default instance peformance tuning is done in zgemm.
|
||||
// Single instance tuning is done based on env set.
|
||||
dim_t single_instance = bli_env_get_var( "BLIS_SINGLE_INSTANCE", -1 );
|
||||
|
||||
//dim_t nt = bli_thread_get_num_threads(); // get number of threads
|
||||
bool nt = bli_thread_get_is_parallel(); // Check if parallel zgemm is invoked.
|
||||
if ( nt )
|
||||
{
|
||||
// Will call parallelized zgemm code - sup & native
|
||||
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
|
||||
(
|
||||
&alphao,
|
||||
&ao,
|
||||
&bo,
|
||||
&betao,
|
||||
&co,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
return;
|
||||
}
|
||||
|
||||
// The code below will be called when number of threads = 1.
|
||||
#if ENABLE_INDUCED_METHOD
|
||||
/* 3m_sqp is optimal for certain matrix shapes.
|
||||
Initial study that it works well for square sizes and sizes closer to square shape.
|
||||
|
||||
* Usage of 3m_sqp is restricted to sizes, where it is found efficient compared to native, sup and other induced method.
|
||||
* Further investigation is necessary to make the usage choices more generic. */
|
||||
bool sqp_on = false;
|
||||
if( (m0 == n0 ) && ( n0 == k0 ) && ( m0 == 128 ) )
|
||||
{
|
||||
sqp_on = true;
|
||||
}
|
||||
|
||||
// current range of sizes used for 3m_sqp to be expaned after evaluation.
|
||||
if( ( m0 >= 4200) && ( m0 <= 4600 ) && ( ( n0 >= 326 ) || (n0 <= 1600 ) )
|
||||
&& ( k0 == 1120 ) ) //to be tuned further.
|
||||
{
|
||||
sqp_on = true;
|
||||
}
|
||||
|
||||
if( ( blis_transb == BLIS_NO_TRANSPOSE) && ( sqp_on == true ) )
|
||||
{
|
||||
//sqp algo is found better for n > 40
|
||||
if(bli_gemm_sqp(&alphao, &ao, &bo, &betao, &co, NULL, NULL)==BLIS_SUCCESS)
|
||||
{
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
return;
|
||||
}
|
||||
}
|
||||
#endif//ENABLE_INDUCED_METHOD
|
||||
|
||||
// native tuning resulted in better numbers compared to sup in constrained multi-instance
|
||||
// sup has been enabled for single instance cases.
|
||||
if(single_instance==1)
|
||||
{
|
||||
err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
|
||||
if(status==BLIS_SUCCESS)
|
||||
{
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
// fall back on native path when zgemm is not handled in sup path.
|
||||
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
|
||||
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
return;
|
||||
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
}// end of zgemm_
|
||||
|
||||
|
||||
INSERT_GENTFUNC_BLAS_SC( gemm, gemm )
|
||||
|
||||
|
||||
// Observed a regression in dgemm with this function addition.
|
||||
// Disabling temporarily.
|
||||
#if 0
|
||||
void dzgemm_
|
||||
(
|
||||
const f77_char* transa,
|
||||
const f77_char* transb,
|
||||
const f77_int* m,
|
||||
const f77_int* n,
|
||||
const f77_int* k,
|
||||
const dcomplex* alpha,
|
||||
const double* a, const f77_int* lda,
|
||||
const dcomplex* b, const f77_int* ldb,
|
||||
const dcomplex* beta,
|
||||
dcomplex* c, const f77_int* ldc
|
||||
)
|
||||
{
|
||||
|
||||
trans_t blis_transa;
|
||||
trans_t blis_transb;
|
||||
dim_t m0, n0, k0;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
bli_init_auto();
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k,
|
||||
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
|
||||
|
||||
/* Perform BLAS parameter checking. */
|
||||
PASTEBLACHK(gemm)
|
||||
(
|
||||
MKSTR(z),
|
||||
MKSTR(gemm),
|
||||
transa,
|
||||
transb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
lda,
|
||||
ldb,
|
||||
ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa );
|
||||
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb );
|
||||
|
||||
/* Typecast BLAS integers to BLIS integers. */
|
||||
bli_convert_blas_dim1( *m, m0 );
|
||||
bli_convert_blas_dim1( *n, n0 );
|
||||
bli_convert_blas_dim1( *k, k0 );
|
||||
|
||||
/* Set the row and column strides of the matrix operands. */
|
||||
const inc_t rs_a = 1;
|
||||
const inc_t cs_a = *lda;
|
||||
const inc_t rs_b = 1;
|
||||
const inc_t cs_b = *ldb;
|
||||
const inc_t rs_c = 1;
|
||||
const inc_t cs_c = *ldc;
|
||||
|
||||
const num_t dt = BLIS_DCOMPLEX;
|
||||
const num_t dt_a = BLIS_DOUBLE;
|
||||
|
||||
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
|
||||
obj_t ao = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t bo = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
|
||||
obj_t co = BLIS_OBJECT_INITIALIZER;
|
||||
|
||||
dim_t m0_a, n0_a;
|
||||
dim_t m0_b, n0_b;
|
||||
|
||||
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a );
|
||||
bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b );
|
||||
|
||||
bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao );
|
||||
bli_obj_init_finish_1x1( dt, (dcomplex*)beta, &betao );
|
||||
|
||||
bli_obj_init_finish( dt_a, m0_a, n0_a, (dcomplex*)a, rs_a, cs_a, &ao );
|
||||
bli_obj_init_finish( dt, m0_b, n0_b, (dcomplex*)b, rs_b, cs_b, &bo );
|
||||
bli_obj_init_finish( dt, m0, n0, (dcomplex*)c, rs_c, cs_c, &co );
|
||||
|
||||
bli_obj_set_conjtrans( blis_transa, &ao );
|
||||
bli_obj_set_conjtrans( blis_transb, &bo );
|
||||
|
||||
// fall back on native path when zgemm is not handled in sup path.
|
||||
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
|
||||
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
}// end of dzgemm_
|
||||
#endif
|
||||
#endif
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020 - 21, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -147,856 +147,5 @@ void PASTEF77(ch,blasname) \
|
||||
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
// BLAS dgemv wrapper: y := beta*y + alpha*op(A)*x with op in {none, T, C}.
// Mirrors netlib BLAS semantics (including its early-return quirks) and, when
// running on an AMD zen CPU, dispatches directly to AMD-optimized unblocked
// variants instead of going through the generic BLIS typed API.
void dgemv_
     (
       const f77_char* transa,
       const f77_int*  m,
       const f77_int*  n,
       const double*   alpha,
       const double*   a, const f77_int* lda,
       const double*   x, const f77_int* incx,
       const double*   beta,
             double*   y, const f77_int* incy
     )
{
    trans_t blis_transa;
    dim_t   m0, n0;
    dim_t   m_y, n_x;      // lengths of y and x after accounting for op(A)
    double* x0;
    double* y0;
    inc_t   incx0;
    inc_t   incy0;
    inc_t   rs_a, cs_a;

    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
    AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);

    /* Perform BLAS parameter checking. */
    PASTEBLACHK(gemv)
    (
      MKSTR(d),
      MKSTR(gemv),
      transa,
      m,
      n,
      lda,
      incx,
      incy
    );

    /* Quick return for empty operands (netlib behavior: no scaling of y). */
    if (*m == 0 || *n == 0)
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* Map BLAS chars to their corresponding BLIS enumerated type value. */
    if      ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
    else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
    else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
    else
    {
        // See comment for bli_param_map_netlib_to_blis_side() above.
        //bli_check_error_code( BLIS_INVALID_TRANS );
        // Invalid trans chars were already reported by the parameter check;
        // fall back to no-transpose rather than aborting.
        blis_transa = BLIS_NO_TRANSPOSE;
    }

    /* Convert/typecast negative values of m and n to zero. */
    if ( *m < 0 ) m0 = ( dim_t )0;
    else          m0 = ( dim_t )(*m);

    if ( *n < 0 ) n0 = ( dim_t )0;
    else          n0 = ( dim_t )(*n);

    /* Determine the dimensions of x and y so we can adjust the increments,
       if necessary.*/
    if ( bli_does_notrans( blis_transa ) )
    {
        m_y = m0;
        n_x = n0;
    }
    else
    {
        m_y = n0;
        n_x = m0;
    }

    /* BLAS handles cases where trans(A) has no columns, and x has no elements,
       in a peculiar way. In these situations, BLAS returns without performing
       any action, even though most sane interpretations of gemv would have the
       the operation reduce to y := beta * y. Here, we catch those cases that
       BLAS would normally mishandle and emulate the BLAS exactly so as to
       provide "bug-for-bug" compatibility. Note that this extreme level of
       compatibility would not be as much of an issue if it weren't for the
       fact that some BLAS test suites actually test for these cases. Also, it
       should be emphasized that BLIS, if called natively, does NOT exhibit
       this quirky behavior; it will scale y by beta, as one would expect. */
    if ( m_y > 0 && n_x == 0 )
    {
        /* Finalize BLIS. */
        // bli_finalize_auto();

        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */
    if ( *incx < 0 )
    {
        /* Point to the last element; the (kept-negative) increment walks
           backward from there, matching BLAS addressing. */
        x0    = ((double*)x) + (n_x-1)*(-*incx);
        incx0 = ( inc_t )(*incx);
    }
    else
    {
        x0    = ((double*)x);
        incx0 = ( inc_t )(*incx);
    }

    if ( *incy < 0 )
    {
        y0    = ((double*)y) + (m_y-1)*(-*incy);
        incy0 = ( inc_t )(*incy);
    }
    else
    {
        y0    = ((double*)y);
        incy0 = ( inc_t )(*incy);
    }

    /* Set the row and column strides of A (column-major storage). */
    rs_a = 1;
    cs_a = *lda;

    // When dynamic dispatch is enabled, i.e. the library is built for the
    // 'amdzen' configuration, this function is invoked on all architectures,
    // including 'generic'. Invoke architecture-specific kernels only if we
    // are sure we are running on zen, zen2, zen3 or zen4; otherwise fall back
    // to reference kernels (via the framework and context).
    arch_t id = bli_arch_query_id();
    bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
                   (id == BLIS_ARCH_ZEN3) ||
                   (id == BLIS_ARCH_ZEN2) ||
                   (id == BLIS_ARCH_ZEN);

    if (bamdzen == 0)
    {
        /* Non-zen CPU: call the generic BLIS typed interface. */
        PASTEMAC2(d,gemv,BLIS_TAPI_EX_SUF)
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (double*)alpha,
          (double*)a, rs_a, cs_a,
          x0, incx0,
          (double*)beta,
          y0, incy0,
          NULL,
          NULL
        );
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* Call variants based on transpose value. */
    if(bli_does_notrans(blis_transa))
    {
        //variant_2 is chosen for column-storage
        // and uses axpyf-based implementation
        bli_dgemv_unf_var2
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (double*)alpha,
          (double*)a, rs_a, cs_a,
          x0, incx0,
          (double*)beta,
          y0, incy0,
          NULL
        );
    }
    else
    {
        //var_1 is chosen for row-storage
        //and uses dotxf-based implementation
        bli_dgemv_unf_var1
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (double*)alpha,
          (double*)a, rs_a, cs_a,
          x0, incx0,
          (double*)beta,
          y0, incy0,
          NULL
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
|
||||
|
||||
// BLAS sgemv wrapper: y := beta*y + alpha*op(A)*x in single precision.
// Structure mirrors dgemv_ above: netlib-compatible parameter handling,
// then dispatch to AMD-optimized variants on zen CPUs, or to the generic
// BLIS typed API elsewhere.
void sgemv_
     (
       const f77_char* transa,
       const f77_int*  m,
       const f77_int*  n,
       const float*    alpha,
       const float*    a, const f77_int* lda,
       const float*    x, const f77_int* incx,
       const float*    beta,
             float*    y, const f77_int* incy
     )
{
    trans_t blis_transa;
    dim_t   m0, n0;
    dim_t   m_y, n_x;      // lengths of y and x after accounting for op(A)
    float*  x0;
    float*  y0;
    inc_t   incx0;
    inc_t   incy0;
    inc_t   rs_a, cs_a;

    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
    AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
    /* Perform BLAS parameter checking. */
    PASTEBLACHK(gemv)
    (
      MKSTR(s),
      MKSTR(gemv),
      transa,
      m,
      n,
      lda,
      incx,
      incy
    );

    /* Quick return for empty operands (netlib behavior: no scaling of y). */
    if (*m == 0 || *n == 0)
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* Map BLAS chars to their corresponding BLIS enumerated type value. */
    if      ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
    else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
    else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
    else
    {
        // See comment for bli_param_map_netlib_to_blis_side() above.
        //bli_check_error_code( BLIS_INVALID_TRANS );
        // Invalid trans chars were already reported by the parameter check;
        // fall back to no-transpose rather than aborting.
        blis_transa = BLIS_NO_TRANSPOSE;
    }

    /* Convert/typecast negative values of m and n to zero. */
    if ( *m < 0 ) m0 = ( dim_t )0;
    else          m0 = ( dim_t )(*m);

    if ( *n < 0 ) n0 = ( dim_t )0;
    else          n0 = ( dim_t )(*n);

    /* Determine the dimensions of x and y so we can adjust the increments,
       if necessary.*/
    if ( bli_does_notrans( blis_transa ) )
    {
        m_y = m0;
        n_x = n0;
    }
    else
    {
        m_y = n0;
        n_x = m0;
    }

    /* BLAS handles cases where trans(A) has no columns, and x has no elements,
       in a peculiar way. In these situations, BLAS returns without performing
       any action, even though most sane interpretations of gemv would have the
       the operation reduce to y := beta * y. Here, we catch those cases that
       BLAS would normally mishandle and emulate the BLAS exactly so as to
       provide "bug-for-bug" compatibility. Note that this extreme level of
       compatibility would not be as much of an issue if it weren't for the
       fact that some BLAS test suites actually test for these cases. Also, it
       should be emphasized that BLIS, if called natively, does NOT exhibit
       this quirky behavior; it will scale y by beta, as one would expect. */
    if ( m_y > 0 && n_x == 0 )
    {
        /* Finalize BLIS. */
        // bli_finalize_auto();
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */
    if ( *incx < 0 )
    {
        /* Point to the last element; the (kept-negative) increment walks
           backward from there, matching BLAS addressing. */
        x0    = ((float*)x) + (n_x-1)*(-*incx);
        incx0 = ( inc_t )(*incx);
    }
    else
    {
        x0    = ((float*)x);
        incx0 = ( inc_t )(*incx);
    }

    if ( *incy < 0 )
    {
        y0    = ((float*)y) + (m_y-1)*(-*incy);
        incy0 = ( inc_t )(*incy);
    }
    else
    {
        y0    = ((float*)y);
        incy0 = ( inc_t )(*incy);
    }

    /* Set the row and column strides of A (column-major storage). */
    rs_a = 1;
    cs_a = *lda;

    // When dynamic dispatch is enabled, i.e. the library is built for the
    // 'amdzen' configuration, this function is invoked on all architectures,
    // including 'generic'. Invoke architecture-specific kernels only if we
    // are sure we are running on zen, zen2, zen3 or zen4; otherwise fall back
    // to reference kernels (via the framework and context).
    arch_t id = bli_arch_query_id();
    bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
                   (id == BLIS_ARCH_ZEN3) ||
                   (id == BLIS_ARCH_ZEN2) ||
                   (id == BLIS_ARCH_ZEN);

    if (bamdzen == 0)
    {
        /* Non-zen CPU: call the generic BLIS typed interface. */
        PASTEMAC2(s,gemv,BLIS_TAPI_EX_SUF)
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (float*)alpha,
          (float*)a, rs_a, cs_a,
          x0, incx0,
          (float*)beta,
          y0, incy0,
          NULL,
          NULL
        );
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* Call variants based on transpose value: var2 (axpyf-based) for the
       no-transpose case, var1 (dotxf-based) otherwise. */
    if(bli_does_notrans(blis_transa))
    {
        bli_sgemv_unf_var2
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (float*)alpha,
          (float*)a, rs_a, cs_a,
          x0, incx0,
          (float*)beta,
          y0, incy0,
          NULL
        );
    }
    else
    {
        bli_sgemv_unf_var1
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (float*)alpha,
          (float*)a, rs_a, cs_a,
          x0, incx0,
          (float*)beta,
          y0, incy0,
          NULL
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
|
||||
|
||||
|
||||
// BLAS cgemv wrapper: y := beta*y + alpha*op(A)*x for single-precision
// complex. In addition to the zen-vs-generic dispatch used by sgemv_/dgemv_,
// this routine special-cases m_y == 1 (the result is a single element) and
// computes it as a dot product plus scalar update.
void cgemv_
     (
       const f77_char* transa,
       const f77_int*  m,
       const f77_int*  n,
       const scomplex* alpha,
       const scomplex* a, const f77_int* lda,
       const scomplex* x, const f77_int* incx,
       const scomplex* beta,
             scomplex* y, const f77_int* incy
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
    AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);

    trans_t blis_transa;
    dim_t   m0, n0;
    dim_t   m_y, n_x;      // lengths of y and x after accounting for op(A)
    scomplex* x0;
    scomplex* y0;
    inc_t   incx0;
    inc_t   incy0;
    inc_t   rs_a, cs_a;

    /* Perform BLAS parameter checking. */
    PASTEBLACHK(gemv)
    (
      MKSTR(c),
      MKSTR(gemv),
      transa,
      m,
      n,
      lda,
      incx,
      incy
    );

    /* Quick return for empty operands (netlib behavior: no scaling of y). */
    if (*m == 0 || *n == 0)
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* Map BLAS chars to their corresponding BLIS enumerated type value. */
    if     ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
    else if( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
    else if( * transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
    else
    {
        // See comment for bli_param_map_netlib_to_blis_side() above.
        // bli_check_error_code( BLIS_INVALID_TRANS );
        // Invalid trans chars were already reported by the parameter check;
        // fall back to no-transpose rather than aborting.
        blis_transa = BLIS_NO_TRANSPOSE;
    }

    /* Convert/typecast negative values of m and n to zero. */
    if( *m < 0 ) m0 = (dim_t)0;
    else         m0 = (dim_t)(*m);

    if( *n < 0 ) n0 = (dim_t)0;
    else         n0 = (dim_t)(*n);

    /* Determine the dimensions of x and y so we can adjust the increments,
       if necessary.*/
    if( bli_does_notrans( blis_transa ) ) { m_y = m0, n_x = n0; }
    else                                  { m_y = n0; n_x = m0; }

    /* BLAS handles cases where trans(A) has no columns, and x has no elements,
       in a peculiar way. In these situations, BLAS returns without performing
       any action, even though most sane interpretations of gemv would have the
       the operation reduce to y := beta * y. Here, we catch those cases that
       BLAS would normally mishandle and emulate the BLAS exactly so as to
       provide "bug-for-bug" compatibility. Note that this extreme level of
       compatibility would not be as much of an issue if it weren't for the
       fact that some BLAS test suites actually test for these cases. Also, it
       should be emphasized that BLIS, if called natively, does NOT exhibit
       this quirky behavior; it will scale y by beta, as one would expect. */

    if ( m_y > 0 && n_x == 0 )
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */
    if( *incx < 0 )
    {
        /* Point to the last element; the (kept-negative) increment walks
           backward from there, matching BLAS addressing. */
        x0    = ((scomplex*)x) + (n_x-1)*(-*incx);
        incx0 = ( inc_t )(*incx);
    }
    else
    {
        x0    = ((scomplex*)x);
        incx0 = (inc_t)(*incx);
    }

    if ( *incy < 0 )
    {
        y0    = ((scomplex*)y) + (m_y-1)*(-*incy);
        incy0 = ( inc_t )(*incy);
    }
    else
    {
        y0    = ((scomplex*)y);
        incy0 = ( inc_t )(*incy);
    }

    /* Set the row and column strides of A (column-major storage). */
    rs_a = 1;
    cs_a = *lda;

    // When dynamic dispatch is enabled, i.e. the library is built for the
    // 'amdzen' configuration, this function is invoked on all architectures,
    // including 'generic'. Invoke architecture-specific kernels only if we
    // are sure we are running on zen, zen2, zen3 or zen4; otherwise fall back
    // to reference kernels (via the framework and context).
    arch_t id = bli_arch_query_id();
    bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
                   (id == BLIS_ARCH_ZEN3) ||
                   (id == BLIS_ARCH_ZEN2) ||
                   (id == BLIS_ARCH_ZEN);

    /* Fast path: op(A) has a single row, so y has exactly one element and
       the whole gemv reduces to rho = op(A)*x followed by
       y[0] = beta*y[0] + alpha*rho. */
    if( m_y == 1 )
    {
        conj_t conja = bli_extract_conj(blis_transa);
        scomplex rho;
        if (bamdzen)
        {
            /* AMD-optimized dotv kernel. The stride selects the lone row
               (no-transpose: step by cs_a) or lone column (transpose: rs_a). */
            bli_cdotv_zen_int5
            (
              conja,
              BLIS_NO_CONJUGATE,
              n_x,
              (scomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
              x0, incx0,
              &rho,
              NULL
            );
        }
        else
        {
            /* Call BLIS interface. */
            PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
            (
              conja,
              BLIS_NO_CONJUGATE,
              n_x,
              (scomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
              x0, incx0,
              &rho,
              NULL,
              NULL
            );
        }

        /* y[0] = beta*y[0] + alpha*rho, with beta == 0 treated as an
           overwrite (so NaN/Inf in y is not propagated). */
        scomplex yval = *y0;
        if(!bli_ceq0(*beta))
        {
            bli_cscals( *beta, yval );
        }
        else
        {
            bli_csetsc( 0.0, 0.0, &yval);
        }
        if(!bli_ceq0(*alpha))
        {
            bli_caxpys( *alpha, rho, yval);
        }
        y0->real = yval.real;
        y0->imag = yval.imag;

        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    if (bamdzen == 0)
    {
        /* Non-zen CPU: call the generic BLIS typed interface. */
        PASTEMAC2(c,gemv,BLIS_TAPI_EX_SUF)
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (scomplex*)alpha,
          (scomplex*)a, rs_a, cs_a,
          x0, incx0,
          (scomplex*)beta,
          y0, incy0,
          NULL,
          NULL
        );
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* call variants based on transpose value */
    if( bli_does_notrans( blis_transa ) )
    {
        bli_cgemv_unf_var2
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (scomplex*)alpha,
          (scomplex*)a, rs_a, cs_a,
          x0, incx0,
          (scomplex*)beta,
          y0, incy0,
          NULL
        );
    }
    else
    {
        bli_cgemv_unf_var1
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (scomplex*)alpha,
          (scomplex*)a, rs_a, cs_a,
          x0, incx0,
          (scomplex*)beta,
          y0, incy0,
          NULL
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
|
||||
|
||||
|
||||
// BLAS zgemv wrapper: y := beta*y + alpha*op(A)*x for double-precision
// complex. Structurally identical to cgemv_ above, including the
// single-row (m_y == 1) dot-product fast path and the zen dispatch.
void zgemv_
     (
       const f77_char* transa,
       const f77_int*  m,
       const f77_int*  n,
       const dcomplex* alpha,
       const dcomplex* a, const f77_int* lda,
       const dcomplex* x, const f77_int* incx,
       const dcomplex* beta,
             dcomplex* y, const f77_int* incy
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
    AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);

    trans_t blis_transa;
    dim_t   m0, n0;
    dim_t   m_y, n_x;      // lengths of y and x after accounting for op(A)
    dcomplex* x0;
    dcomplex* y0;
    inc_t   incx0;
    inc_t   incy0;
    inc_t   rs_a, cs_a;

    /* Perform BLAS parameter checking. */
    PASTEBLACHK(gemv)
    (
      MKSTR(z),
      MKSTR(gemv),
      transa,
      m,
      n,
      lda,
      incx,
      incy
    );

    /* Quick return for empty operands (netlib behavior: no scaling of y). */
    if (*m == 0 || *n == 0)
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* Map BLAS chars to their corresponding BLIS enumerated type value. */
    if     ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
    else if( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
    else if( * transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
    else
    {
        // See comment for bli_param_map_netlib_to_blis_side() above.
        // bli_check_error_code( BLIS_INVALID_TRANS );
        // Invalid trans chars were already reported by the parameter check;
        // fall back to no-transpose rather than aborting.
        blis_transa = BLIS_NO_TRANSPOSE;
    }

    /* Convert/typecast negative values of m and n to zero. */
    if( *m < 0 ) m0 = (dim_t)0;
    else         m0 = (dim_t)(*m);

    if( *n < 0 ) n0 = (dim_t)0;
    else         n0 = (dim_t)(*n);

    /* Determine the dimensions of x and y so we can adjust the increments,
       if necessary.*/
    if( bli_does_notrans( blis_transa ) ) { m_y = m0, n_x = n0; }
    else                                  { m_y = n0; n_x = m0; }

    /* BLAS handles cases where trans(A) has no columns, and x has no elements,
       in a peculiar way. In these situations, BLAS returns without performing
       any action, even though most sane interpretations of gemv would have the
       the operation reduce to y := beta * y. Here, we catch those cases that
       BLAS would normally mishandle and emulate the BLAS exactly so as to
       provide "bug-for-bug" compatibility. Note that this extreme level of
       compatibility would not be as much of an issue if it weren't for the
       fact that some BLAS test suites actually test for these cases. Also, it
       should be emphasized that BLIS, if called natively, does NOT exhibit
       this quirky behavior; it will scale y by beta, as one would expect. */

    if ( m_y > 0 && n_x == 0 )
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */
    if( *incx < 0 )
    {
        /* Point to the last element; the (kept-negative) increment walks
           backward from there, matching BLAS addressing. */
        x0    = ((dcomplex*)x) + (n_x-1)*(-*incx);
        incx0 = ( inc_t )(*incx);
    }
    else
    {
        x0    = ((dcomplex*)x);
        incx0 = (inc_t)(*incx);
    }

    if ( *incy < 0 )
    {
        y0    = ((dcomplex*)y) + (m_y-1)*(-*incy);
        incy0 = ( inc_t )(*incy);
    }
    else
    {
        y0    = ((dcomplex*)y);
        incy0 = ( inc_t )(*incy);
    }

    /* Set the row and column strides of A (column-major storage). */
    rs_a = 1;
    cs_a = *lda;

    // When dynamic dispatch is enabled, i.e. the library is built for the
    // 'amdzen' configuration, this function is invoked on all architectures,
    // including 'generic'. Invoke architecture-specific kernels only if we
    // are sure we are running on zen, zen2, zen3 or zen4; otherwise fall back
    // to reference kernels (via the framework and context).
    arch_t id = bli_arch_query_id();
    bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
                   (id == BLIS_ARCH_ZEN3) ||
                   (id == BLIS_ARCH_ZEN2) ||
                   (id == BLIS_ARCH_ZEN);

    /* Fast path: op(A) has a single row, so y has exactly one element and
       the whole gemv reduces to rho = op(A)*x followed by
       y[0] = beta*y[0] + alpha*rho. */
    if( m_y == 1 )
    {
        conj_t conja = bli_extract_conj(blis_transa);
        dcomplex rho;

        if (bamdzen)
        {
            /* AMD-optimized dotv kernel. The stride selects the lone row
               (no-transpose: step by cs_a) or lone column (transpose: rs_a). */
            bli_zdotv_zen_int5
            (
              conja,
              BLIS_NO_CONJUGATE,
              n_x,
              (dcomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
              x0, incx0,
              &rho,
              NULL
            );
        }
        else
        {
            /* Call BLIS interface. */
            PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
            (
              conja,
              BLIS_NO_CONJUGATE,
              n_x,
              (dcomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
              x0, incx0,
              &rho,
              NULL,
              NULL
            );
        }

        /* y[0] = beta*y[0] + alpha*rho, with beta == 0 treated as an
           overwrite (so NaN/Inf in y is not propagated). */
        dcomplex yval = *y0;
        if(!bli_zeq0(*beta))
        {
            bli_zscals( *beta, yval );
        }
        else
        {
            bli_zsetsc( 0.0, 0.0, &yval);
        }
        if(!bli_zeq0(*alpha))
        {
            bli_zaxpys( *alpha, rho, yval);
        }
        y0->real = yval.real;
        y0->imag = yval.imag;

        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    if (bamdzen == 0)
    {
        /* Non-zen CPU: call the generic BLIS typed interface. */
        PASTEMAC2(z,gemv,BLIS_TAPI_EX_SUF)
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (dcomplex*)alpha,
          (dcomplex*)a, rs_a, cs_a,
          x0, incx0,
          (dcomplex*)beta,
          y0, incy0,
          NULL,
          NULL
        );
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* call variants based on transpose value */
    if( bli_does_notrans( blis_transa ) )
    {
        bli_zgemv_unf_var2
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (dcomplex*)alpha,
          (dcomplex*)a, rs_a, cs_a,
          x0, incx0,
          (dcomplex*)beta,
          y0, incy0,
          NULL
        );
    }
    else
    {
        bli_zgemv_unf_var1
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (dcomplex*)alpha,
          (dcomplex*)a, rs_a, cs_a,
          x0, incx0,
          (dcomplex*)beta,
          y0, incy0,
          NULL
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
|
||||
|
||||
|
||||
#else
|
||||
INSERT_GENTFUNC_BLAS( gemv, gemv )
|
||||
#endif
|
||||
#endif
|
||||
|
||||
963
frame/compat/bla_gemv_amd.c
Normal file
963
frame/compat/bla_gemv_amd.c
Normal file
@@ -0,0 +1,963 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
//
// Define BLAS-to-BLIS interfaces.
//
// This macro expands to a generic netlib-compatible ?gemv wrapper for the
// given datatype (ftype/ch). It performs BLAS parameter checking, maps the
// trans char, normalizes negative dims/increments, and forwards to the BLIS
// typed expert API. Architecture-specific overrides for d/s/c/z live below.
//
#undef  GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    x, const f77_int* incx, \
       const ftype*    beta, \
             ftype*    y, const f77_int* incy \
     ) \
{ \
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
    AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); \
    trans_t blis_transa; \
    dim_t   m0, n0; \
    dim_t   m_y, n_x; \
    ftype*  x0; \
    ftype*  y0; \
    inc_t   incx0; \
    inc_t   incy0; \
    inc_t   rs_a, cs_a; \
\
    /* Initialize BLIS. */ \
    bli_init_auto(); \
\
    /* Perform BLAS parameter checking. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), \
      MKSTR(blasname), \
      transa, \
      m, \
      n, \
      lda, \
      incx, \
      incy \
    ); \
\
    /* Quick return for empty operands (netlib behavior). */ \
    if (*m == 0 || *n == 0) { \
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
        return; \
    } \
\
    /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
    bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
\
    /* Convert/typecast negative values of m and n to zero. */ \
    bli_convert_blas_dim1( *m, m0 ); \
    bli_convert_blas_dim1( *n, n0 ); \
\
    /* Determine the dimensions of x and y so we can adjust the increments,
       if necessary.*/ \
    bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \
\
    /* BLAS handles cases where trans(A) has no columns, and x has no elements,
       in a peculiar way. In these situations, BLAS returns without performing
       any action, even though most sane interpretations of gemv would have the
       the operation reduce to y := beta * y. Here, we catch those cases that
       BLAS would normally mishandle and emulate the BLAS exactly so as to
       provide "bug-for-bug" compatibility. Note that this extreme level of
       compatibility would not be as much of an issue if it weren't for the
       fact that some BLAS test suites actually test for these cases. Also, it
       should be emphasized that BLIS, if called natively, does NOT exhibit
       this quirky behavior; it will scale y by beta, as one would expect. */ \
    if ( m_y > 0 && n_x == 0 ) \
    { \
        /* Finalize BLIS. */ \
        bli_finalize_auto(); \
\
        return; \
    } \
\
    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */ \
    bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \
    bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \
\
    /* Set the row and column strides of A (column-major storage). */ \
    rs_a = 1; \
    cs_a = *lda; \
\
    /* Call BLIS interface. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      blis_transa, \
      BLIS_NO_CONJUGATE, \
      m0, \
      n0, \
      (ftype*)alpha, \
      (ftype*)a, rs_a, cs_a, \
      x0, incx0, \
      (ftype*)beta, \
      y0, incy0, \
      NULL, \
      NULL \
    ); \
\
    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}
|
||||
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
/* BLAS-compatible dgemv: y := alpha * op(A) * x + beta * y, double precision.
   op(A) is A, A^T, or A^H as selected by transa. AMD-optimized wrapper:
   on AVX-capable CPUs it dispatches directly to the unblocked gemv variants;
   otherwise it falls back to the context-based expert interface. */
void dgemv_
     (
       const f77_char* transa,
       const f77_int*  m,
       const f77_int*  n,
       const double*   alpha,
       const double*   a, const f77_int* lda,
       const double*   x, const f77_int* incx,
       const double*   beta,
       double*         y, const f77_int* incy
     )
{
    trans_t blis_transa;   /* BLIS-typed transpose parameter.              */
    dim_t   m0, n0;        /* Sanitized (non-negative) matrix dimensions.  */
    dim_t   m_y, n_x;      /* Lengths of y and x after applying op(A).     */
    double* x0;            /* x/y pointers adjusted for negative strides.  */
    double* y0;
    inc_t   incx0;
    inc_t   incy0;
    inc_t   rs_a, cs_a;    /* Row/column strides of A (column-major).      */

    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
    AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);

    /* Perform BLAS parameter checking. */
    PASTEBLACHK(gemv)
    (
      MKSTR(d),
      MKSTR(gemv),
      transa,
      m,
      n,
      lda,
      incx,
      incy
    );

    /* Quick return: empty matrix means nothing to compute. */
    if (*m == 0 || *n == 0)
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* Map BLAS chars to their corresponding BLIS enumerated type value. */
    if      ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
    else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
    else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
    else
    {
        // See comment for bli_param_map_netlib_to_blis_side() above.
        //bli_check_error_code( BLIS_INVALID_TRANS );
        blis_transa = BLIS_NO_TRANSPOSE;
    }

    /* Convert/typecast negative values of m and n to zero. */
    if ( *m < 0 ) m0 = ( dim_t )0;
    else          m0 = ( dim_t )(*m);

    if ( *n < 0 ) n0 = ( dim_t )0;
    else          n0 = ( dim_t )(*n);

    /* Determine the dimensions of x and y so we can adjust the increments,
       if necessary.*/
    if ( bli_does_notrans( blis_transa ) )
    {
        m_y = m0;
        n_x = n0;
    }
    else
    {
        m_y = n0;
        n_x = m0;
    }

    /* BLAS handles cases where trans(A) has no columns, and x has no elements,
       in a peculiar way. In these situations, BLAS returns without performing
       any action, even though most sane interpretations of gemv would have the
       the operation reduce to y := beta * y. Here, we catch those cases that
       BLAS would normally mishandle and emulate the BLAS exactly so as to
       provide "bug-for-bug" compatibility. Note that this extreme level of
       compatibility would not be as much of an issue if it weren't for the
       fact that some BLAS test suites actually test for these cases. Also, it
       should be emphasized that BLIS, if called natively, does NOT exhibit
       this quirky behavior; it will scale y by beta, as one would expect. */
    if ( m_y > 0 && n_x == 0 )
    {
        /* Finalize BLIS. */
        // bli_finalize_auto();

        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */
    if ( *incx < 0 )
    {
        /* BLAS passes the first element even for negative strides; BLIS
           expects the address of the logically-last element instead. */
        x0    = ((double*)x) + (n_x-1)*(-*incx);
        incx0 = ( inc_t )(*incx);
    }
    else
    {
        x0    = ((double*)x);
        incx0 = ( inc_t )(*incx);
    }

    if ( *incy < 0 )
    {
        y0    = ((double*)y) + (m_y-1)*(-*incy);
        incy0 = ( inc_t )(*incy);
    }
    else
    {
        y0    = ((double*)y);
        incy0 = ( inc_t )(*incy);
    }

    /* Set the row and column strides of A. */
    rs_a = 1;
    cs_a = *lda;

    // This function is invoked on all architectures including 'generic'.
    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == FALSE)
    {
        /* Call BLIS interface. */
        PASTEMAC2(d,gemv,BLIS_TAPI_EX_SUF)
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (double*)alpha,
          (double*)a, rs_a, cs_a,
          x0, incx0,
          (double*)beta,
          y0, incy0,
          NULL,
          NULL
        );
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* Call variants based on transpose value. */
    if(bli_does_notrans(blis_transa))
    {
        //variant_2 is chosen for column-storage
        // and uses axpyf-based implementation
        bli_dgemv_unf_var2
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (double*)alpha,
          (double*)a, rs_a, cs_a,
          x0, incx0,
          (double*)beta,
          y0, incy0,
          NULL
        );
    }
    else
    {
        //var_1 is chosen for row-storage
        //and uses dotxf-based implementation
        bli_dgemv_unf_var1
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (double*)alpha,
          (double*)a, rs_a, cs_a,
          x0, incx0,
          (double*)beta,
          y0, incy0,
          NULL
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
|
||||
|
||||
/* BLAS-compatible sgemv: y := alpha * op(A) * x + beta * y, single precision.
   Mirrors dgemv_ above: AVX-capable CPUs dispatch straight to the unblocked
   gemv variants, others use the context-based expert interface. */
void sgemv_
     (
       const f77_char* transa,
       const f77_int*  m,
       const f77_int*  n,
       const float*    alpha,
       const float*    a, const f77_int* lda,
       const float*    x, const f77_int* incx,
       const float*    beta,
       float*          y, const f77_int* incy
     )
{
    trans_t blis_transa;   /* BLIS-typed transpose parameter.              */
    dim_t   m0, n0;        /* Sanitized (non-negative) matrix dimensions.  */
    dim_t   m_y, n_x;      /* Lengths of y and x after applying op(A).     */
    float*  x0;            /* x/y pointers adjusted for negative strides.  */
    float*  y0;
    inc_t   incx0;
    inc_t   incy0;
    inc_t   rs_a, cs_a;    /* Row/column strides of A (column-major).      */

    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
    AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
    /* Perform BLAS parameter checking. */
    PASTEBLACHK(gemv)
    (
      MKSTR(s),
      MKSTR(gemv),
      transa,
      m,
      n,
      lda,
      incx,
      incy
    );

    /* Quick return: empty matrix means nothing to compute. */
    if (*m == 0 || *n == 0)
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* Map BLAS chars to their corresponding BLIS enumerated type value. */
    if      ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
    else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
    else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
    else
    {
        // See comment for bli_param_map_netlib_to_blis_side() above.
        //bli_check_error_code( BLIS_INVALID_TRANS );
        blis_transa = BLIS_NO_TRANSPOSE;
    }

    /* Convert/typecast negative values of m and n to zero. */
    if ( *m < 0 ) m0 = ( dim_t )0;
    else          m0 = ( dim_t )(*m);

    if ( *n < 0 ) n0 = ( dim_t )0;
    else          n0 = ( dim_t )(*n);

    /* Determine the dimensions of x and y so we can adjust the increments,
       if necessary.*/
    if ( bli_does_notrans( blis_transa ) )
    {
        m_y = m0;
        n_x = n0;
    }
    else
    {
        m_y = n0;
        n_x = m0;
    }

    /* BLAS handles cases where trans(A) has no columns, and x has no elements,
       in a peculiar way. In these situations, BLAS returns without performing
       any action, even though most sane interpretations of gemv would have the
       the operation reduce to y := beta * y. Here, we catch those cases that
       BLAS would normally mishandle and emulate the BLAS exactly so as to
       provide "bug-for-bug" compatibility. Note that this extreme level of
       compatibility would not be as much of an issue if it weren't for the
       fact that some BLAS test suites actually test for these cases. Also, it
       should be emphasized that BLIS, if called natively, does NOT exhibit
       this quirky behavior; it will scale y by beta, as one would expect. */
    if ( m_y > 0 && n_x == 0 )
    {
        /* Finalize BLIS. */
        // bli_finalize_auto();
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */
    if ( *incx < 0 )
    {
        /* BLAS passes the first element even for negative strides; BLIS
           expects the address of the logically-last element instead. */
        x0    = ((float*)x) + (n_x-1)*(-*incx);
        incx0 = ( inc_t )(*incx);
    }
    else
    {
        x0    = ((float*)x);
        incx0 = ( inc_t )(*incx);
    }

    if ( *incy < 0 )
    {
        y0    = ((float*)y) + (m_y-1)*(-*incy);
        incy0 = ( inc_t )(*incy);
    }
    else
    {
        y0    = ((float*)y);
        incy0 = ( inc_t )(*incy);
    }

    /* Set the row and column strides of A. */
    rs_a = 1;
    cs_a = *lda;

    // This function is invoked on all architectures including 'generic'.
    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == FALSE)
    {
        /* Call BLIS interface. */
        PASTEMAC2(s,gemv,BLIS_TAPI_EX_SUF)
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (float*)alpha,
          (float*)a, rs_a, cs_a,
          x0, incx0,
          (float*)beta,
          y0, incy0,
          NULL,
          NULL
        );
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* Call variants based on transpose value. */
    if(bli_does_notrans(blis_transa))
    {
        /* var2: column-storage friendly, axpyf-based implementation. */
        bli_sgemv_unf_var2
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (float*)alpha,
          (float*)a, rs_a, cs_a,
          x0, incx0,
          (float*)beta,
          y0, incy0,
          NULL
        );
    }
    else
    {
        /* var1: row-storage friendly, dotxf-based implementation. */
        bli_sgemv_unf_var1
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (float*)alpha,
          (float*)a, rs_a, cs_a,
          x0, incx0,
          (float*)beta,
          y0, incy0,
          NULL
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
|
||||
|
||||
|
||||
/* BLAS-compatible cgemv: y := alpha * op(A) * x + beta * y, single-precision
   complex. Adds a fast path for the m_y == 1 case (result is a single dot
   product), otherwise mirrors the real-domain wrappers above. */
void cgemv_
     (
       const f77_char* transa,
       const f77_int*  m,
       const f77_int*  n,
       const scomplex* alpha,
       const scomplex* a, const f77_int* lda,
       const scomplex* x, const f77_int* incx,
       const scomplex* beta,
       scomplex* y, const f77_int* incy
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
    AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);

    trans_t blis_transa;   /* BLIS-typed transpose parameter.              */
    dim_t m0, n0;          /* Sanitized (non-negative) matrix dimensions.  */
    dim_t m_y, n_x;        /* Lengths of y and x after applying op(A).     */
    scomplex* x0;          /* x/y pointers adjusted for negative strides.  */
    scomplex* y0;
    inc_t incx0;
    inc_t incy0;
    inc_t rs_a, cs_a;      /* Row/column strides of A (column-major).      */

    /* Perform BLAS parameter checking. */
    PASTEBLACHK(gemv)
    (
      MKSTR(c),
      MKSTR(gemv),
      transa,
      m,
      n,
      lda,
      incx,
      incy
    );

    /* Quick return: empty matrix means nothing to compute. */
    if (*m == 0 || *n == 0)
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* Map BLAS chars to their corresponding BLIS enumerated type value. */
    if( *transa == 'n' || *transa == 'N' )        blis_transa = BLIS_NO_TRANSPOSE;
    else if( *transa == 't' || *transa == 'T' )   blis_transa = BLIS_TRANSPOSE;
    else if( * transa == 'c' || *transa == 'C' )  blis_transa = BLIS_CONJ_TRANSPOSE;
    else
    {
        // See comment for bli_param_map_netlib_to_blis_side() above.
        // bli_check_error_code( BLIS_INVALID_TRANS );
        blis_transa = BLIS_NO_TRANSPOSE;
    }

    /* Convert/typecast negative values of m and n to zero. */
    if( *m < 0 ) m0 = (dim_t)0;
    else         m0 = (dim_t)(*m);

    if( *n < 0 ) n0 = (dim_t)0;
    else         n0 = (dim_t)(*n);

    /* Determine the dimensions of x and y so we can adjust the increments,
       if necessary.*/
    if( bli_does_notrans( blis_transa ) ) { m_y = m0, n_x = n0; }
    else                                  { m_y = n0; n_x = m0; }

    /* BLAS handles cases where trans(A) has no columns, and x has no elements,
       in a peculiar way. In these situations, BLAS returns without performing
       any action, even though most sane interpretations of gemv would have the
       the operation reduce to y := beta * y. Here, we catch those cases that
       BLAS would normally mishandle and emulate the BLAS exactly so as to
       provide "bug-for-bug" compatibility. Note that this extreme level of
       compatibility would not be as much of an issue if it weren't for the
       fact that some BLAS test suites actually test for these cases. Also, it
       should be emphasized that BLIS, if called natively, does NOT exhibit
       this quirky behavior; it will scale y by beta, as one would expect. */

    if ( m_y > 0 && n_x == 0 )
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */
    if( *incx < 0 )
    {
        /* BLAS passes the first element even for negative strides; BLIS
           expects the address of the logically-last element instead. */
        x0    = ((scomplex*)x) + (n_x-1)*(-*incx);
        incx0 = ( inc_t )(*incx);
    }
    else
    {
        x0    = ((scomplex*)x);
        incx0 = (inc_t)(*incx);
    }

    if ( *incy < 0 )
    {
        y0    = ((scomplex*)y) + (m_y-1)*(-*incy);
        incy0 = ( inc_t )(*incy);
    }
    else
    {
        y0    = ((scomplex*)y);
        incy0 = ( inc_t )(*incy);
    }

    /* Set the row and column strides of A. */
    rs_a = 1;
    cs_a = *lda;

    /* Fast path: when y has a single element the whole gemv reduces to
       rho = op(A-row) . x, then y[0] = alpha*rho + beta*y[0]. */
    if( m_y == 1 )
    {
        conj_t conja = bli_extract_conj(blis_transa);
        scomplex rho;
        if (bli_cpuid_is_avx_supported() == TRUE)
        {
            /* AVX dotv kernel; A is traversed along the dimension of x. */
            bli_cdotv_zen_int5
            (
              conja,
              BLIS_NO_CONJUGATE,
              n_x,
              (scomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
              x0, incx0,
              &rho,
              NULL
            );
        }
        else
        {
            /* Call BLIS interface. */
            PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
            (
              conja,
              BLIS_NO_CONJUGATE,
              n_x,
              (scomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
              x0, incx0,
              &rho,
              NULL,
              NULL
            );
        }

        /* Combine: yval = beta*y[0] (or 0 if beta == 0), then += alpha*rho. */
        scomplex yval = *y0;
        if(!bli_ceq0(*beta))
        {
            bli_cscals( *beta, yval );
        }
        else
        {
            bli_csetsc( 0.0, 0.0, &yval);
        }
        if(!bli_ceq0(*alpha))
        {
            bli_caxpys( *alpha, rho, yval);
        }
        y0->real = yval.real;
        y0->imag = yval.imag;

        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == FALSE)
    {
        /* Call BLIS interface. */
        PASTEMAC2(c,gemv,BLIS_TAPI_EX_SUF)
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (scomplex*)alpha,
          (scomplex*)a, rs_a, cs_a,
          x0, incx0,
          (scomplex*)beta,
          y0, incy0,
          NULL,
          NULL
        );
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* call variants based on transpose value */
    if( bli_does_notrans( blis_transa ) )
    {
        bli_cgemv_unf_var2
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (scomplex*)alpha,
          (scomplex*)a, rs_a, cs_a,
          x0, incx0,
          (scomplex*)beta,
          y0, incy0,
          NULL
        );
    }
    else
    {
        bli_cgemv_unf_var1
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (scomplex*)alpha,
          (scomplex*)a, rs_a, cs_a,
          x0, incx0,
          (scomplex*)beta,
          y0, incy0,
          NULL
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
|
||||
|
||||
|
||||
/* BLAS-compatible zgemv: y := alpha * op(A) * x + beta * y, double-precision
   complex. Structure identical to cgemv_ above, including the single-row
   (m_y == 1) dot-product fast path. */
void zgemv_
     (
       const f77_char* transa,
       const f77_int*  m,
       const f77_int*  n,
       const dcomplex* alpha,
       const dcomplex* a, const f77_int* lda,
       const dcomplex* x, const f77_int* incx,
       const dcomplex* beta,
       dcomplex* y, const f77_int* incy
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
    AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);

    trans_t blis_transa;   /* BLIS-typed transpose parameter.              */
    dim_t m0, n0;          /* Sanitized (non-negative) matrix dimensions.  */
    dim_t m_y, n_x;        /* Lengths of y and x after applying op(A).     */
    dcomplex* x0;          /* x/y pointers adjusted for negative strides.  */
    dcomplex* y0;
    inc_t incx0;
    inc_t incy0;
    inc_t rs_a, cs_a;      /* Row/column strides of A (column-major).      */

    /* Perform BLAS parameter checking. */
    PASTEBLACHK(gemv)
    (
      MKSTR(z),
      MKSTR(gemv),
      transa,
      m,
      n,
      lda,
      incx,
      incy
    );

    /* Quick return: empty matrix means nothing to compute. */
    if (*m == 0 || *n == 0)
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* Map BLAS chars to their corresponding BLIS enumerated type value. */
    if( *transa == 'n' || *transa == 'N' )        blis_transa = BLIS_NO_TRANSPOSE;
    else if( *transa == 't' || *transa == 'T' )   blis_transa = BLIS_TRANSPOSE;
    else if( * transa == 'c' || *transa == 'C' )  blis_transa = BLIS_CONJ_TRANSPOSE;
    else
    {
        // See comment for bli_param_map_netlib_to_blis_side() above.
        // bli_check_error_code( BLIS_INVALID_TRANS );
        blis_transa = BLIS_NO_TRANSPOSE;
    }

    /* Convert/typecast negative values of m and n to zero. */
    if( *m < 0 ) m0 = (dim_t)0;
    else         m0 = (dim_t)(*m);

    if( *n < 0 ) n0 = (dim_t)0;
    else         n0 = (dim_t)(*n);

    /* Determine the dimensions of x and y so we can adjust the increments,
       if necessary.*/
    if( bli_does_notrans( blis_transa ) ) { m_y = m0, n_x = n0; }
    else                                  { m_y = n0; n_x = m0; }

    /* BLAS handles cases where trans(A) has no columns, and x has no elements,
       in a peculiar way. In these situations, BLAS returns without performing
       any action, even though most sane interpretations of gemv would have the
       the operation reduce to y := beta * y. Here, we catch those cases that
       BLAS would normally mishandle and emulate the BLAS exactly so as to
       provide "bug-for-bug" compatibility. Note that this extreme level of
       compatibility would not be as much of an issue if it weren't for the
       fact that some BLAS test suites actually test for these cases. Also, it
       should be emphasized that BLIS, if called natively, does NOT exhibit
       this quirky behavior; it will scale y by beta, as one would expect. */

    if ( m_y > 0 && n_x == 0 )
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */
    if( *incx < 0 )
    {
        /* BLAS passes the first element even for negative strides; BLIS
           expects the address of the logically-last element instead. */
        x0    = ((dcomplex*)x) + (n_x-1)*(-*incx);
        incx0 = ( inc_t )(*incx);
    }
    else
    {
        x0    = ((dcomplex*)x);
        incx0 = (inc_t)(*incx);
    }

    if ( *incy < 0 )
    {
        y0    = ((dcomplex*)y) + (m_y-1)*(-*incy);
        incy0 = ( inc_t )(*incy);
    }
    else
    {
        y0    = ((dcomplex*)y);
        incy0 = ( inc_t )(*incy);
    }

    /* Set the row and column strides of A. */
    rs_a = 1;
    cs_a = *lda;

    /* Fast path: when y has a single element the whole gemv reduces to
       rho = op(A-row) . x, then y[0] = alpha*rho + beta*y[0]. */
    if( m_y == 1 )
    {
        conj_t conja = bli_extract_conj(blis_transa);
        dcomplex rho;

        if (bli_cpuid_is_avx_supported() == TRUE)
        {
            /* AVX dotv kernel; A is traversed along the dimension of x. */
            bli_zdotv_zen_int5
            (
              conja,
              BLIS_NO_CONJUGATE,
              n_x,
              (dcomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
              x0, incx0,
              &rho,
              NULL
            );
        }
        else
        {
            /* Call BLIS interface. */
            PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
            (
              conja,
              BLIS_NO_CONJUGATE,
              n_x,
              (dcomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
              x0, incx0,
              &rho,
              NULL,
              NULL
            );
        }

        /* Combine: yval = beta*y[0] (or 0 if beta == 0), then += alpha*rho. */
        dcomplex yval = *y0;
        if(!bli_zeq0(*beta))
        {
            bli_zscals( *beta, yval );
        }
        else
        {
            bli_zsetsc( 0.0, 0.0, &yval);
        }
        if(!bli_zeq0(*alpha))
        {
            bli_zaxpys( *alpha, rho, yval);
        }
        y0->real = yval.real;
        y0->imag = yval.imag;

        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == FALSE)
    {
        /* Call BLIS interface. */
        PASTEMAC2(z,gemv,BLIS_TAPI_EX_SUF)
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (dcomplex*)alpha,
          (dcomplex*)a, rs_a, cs_a,
          x0, incx0,
          (dcomplex*)beta,
          y0, incy0,
          NULL,
          NULL
        );
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
        return;
    }

    /* call variants based on transpose value */
    if( bli_does_notrans( blis_transa ) )
    {
        bli_zgemv_unf_var2
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (dcomplex*)alpha,
          (dcomplex*)a, rs_a, cs_a,
          x0, incx0,
          (dcomplex*)beta,
          y0, incy0,
          NULL
        );
    }
    else
    {
        bli_zgemv_unf_var1
        (
          blis_transa,
          BLIS_NO_CONJUGATE,
          m0,
          n0,
          (dcomplex*)alpha,
          (dcomplex*)a, rs_a, cs_a,
          x0, incx0,
          (dcomplex*)beta,
          y0, incy0,
          NULL
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020-21, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -93,179 +93,5 @@ void PASTEF772(chx,cha,blasname) \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
|
||||
void sscal_
|
||||
(
|
||||
const f77_int* n,
|
||||
const float* alpha,
|
||||
float* x, const f77_int* incx
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', (void *) alpha, *n, *incx );
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
inc_t incx0;
|
||||
/* Initialize BLIS. */
|
||||
//bli_init_auto();
|
||||
|
||||
if (*n == 0 || alpha == NULL) {
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = (x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
/* Call BLIS kernel */
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen) {
|
||||
bli_sscalv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(float *)alpha,
|
||||
x0, incx0,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else{
|
||||
PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE,\
|
||||
n0, \
|
||||
(float *)alpha,\
|
||||
x0, incx0,\
|
||||
NULL, \
|
||||
NULL \
|
||||
);\
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
}
|
||||
|
||||
void dscal_
|
||||
(
|
||||
const f77_int* n,
|
||||
const double* alpha,
|
||||
double* x, const f77_int* incx
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', (void *)alpha, *n, *incx );
|
||||
dim_t n0;
|
||||
double* x0;
|
||||
inc_t incx0;
|
||||
|
||||
/* Initialize BLIS */
|
||||
//bli_init_auto();
|
||||
|
||||
if (*n == 0 || alpha == NULL) {
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Convert typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = (x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
/* Call BLIS kernel */
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen){
|
||||
bli_dscalv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(double*) alpha,
|
||||
x0, incx0,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else{
|
||||
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE,\
|
||||
n0, \
|
||||
(double *)alpha,\
|
||||
x0, incx0,\
|
||||
NULL, \
|
||||
NULL \
|
||||
);\
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCSCAL_BLAS_CZ( scal, scalv )
|
||||
#else
|
||||
INSERT_GENTFUNCSCAL_BLAS( scal, scalv )
|
||||
#endif
|
||||
#endif
|
||||
|
||||
260
frame/compat/bla_scal_amd.c
Normal file
260
frame/compat/bla_scal_amd.c
Normal file
@@ -0,0 +1,260 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-to-BLIS interfaces.
|
||||
//
|
||||
/* Generic BLAS-to-BLIS scal wrapper template. Instantiated per type pair
   (ftype_x = vector element type, ftype_a = alpha type) to define the
   csscal/zdscal-style entry points: x := alpha * x. The mixed-type cases
   are handled by casting alpha into the vector's domain and calling the
   homogeneous scalv. */
#undef  GENTFUNCSCAL
#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \
\
void PASTEF772(chx,cha,blasname) \
     ( \
       const f77_int* n, \
       const ftype_a* alpha, \
       ftype_x* x, const f77_int* incx \
     ) \
{ \
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
	dim_t    n0; \
	ftype_x* x0; \
	inc_t    incx0; \
	ftype_x  alpha_cast; /* alpha converted into the vector's domain */ \
\
	/* Initialize BLIS. */ \
	bli_init_auto(); \
\
	/* Quick return: empty vector, or no alpha supplied. */ \
	if (*n == 0 || alpha == NULL) { \
		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
		return ; \
	} \
\
	/* Convert/typecast negative values of n to zero. */ \
	bli_convert_blas_dim1( *n, n0 ); \
\
	/* If the input increments are negative, adjust the pointers so we can
	   use positive increments instead. */ \
	bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \
\
	/* NOTE: We do not natively implement BLAS's csscal/zdscal in BLIS.
	   that is, we just always sub-optimally implement those cases
	   by casting alpha to ctype_x (potentially the complex domain) and
	   using the homogeneous datatype instance according to that type. */ \
	PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \
\
	/* Call BLIS interface. */ \
	PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
	( \
	  BLIS_NO_CONJUGATE, \
	  n0, \
	  &alpha_cast, \
	  x0, incx0, \
	  NULL, \
	  NULL  \
	); \
\
	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
	/* Finalize BLIS. */ \
	bli_finalize_auto(); \
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
|
||||
void sscal_
|
||||
(
|
||||
const f77_int* n,
|
||||
const float* alpha,
|
||||
float* x, const f77_int* incx
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', (void *) alpha, *n, *incx );
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
inc_t incx0;
|
||||
/* Initialize BLIS. */
|
||||
//bli_init_auto();
|
||||
|
||||
if (*n == 0 || alpha == NULL) {
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = (x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE) {
|
||||
bli_sscalv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(float *)alpha,
|
||||
x0, incx0,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else{
|
||||
PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE,\
|
||||
n0, \
|
||||
(float *)alpha,\
|
||||
x0, incx0,\
|
||||
NULL, \
|
||||
NULL \
|
||||
);\
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
}
|
||||
|
||||
void dscal_
|
||||
(
|
||||
const f77_int* n,
|
||||
const double* alpha,
|
||||
double* x, const f77_int* incx
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', (void *)alpha, *n, *incx );
|
||||
dim_t n0;
|
||||
double* x0;
|
||||
inc_t incx0;
|
||||
|
||||
/* Initialize BLIS */
|
||||
//bli_init_auto();
|
||||
|
||||
if (*n == 0 || alpha == NULL) {
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Convert typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = (x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE){
|
||||
bli_dscalv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
(double*) alpha,
|
||||
x0, incx0,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else{
|
||||
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE,\
|
||||
n0, \
|
||||
(double *)alpha,\
|
||||
x0, incx0,\
|
||||
NULL, \
|
||||
NULL \
|
||||
);\
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCSCAL_BLAS_CZ( scal, scalv )
|
||||
|
||||
#endif
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020-21, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -83,198 +83,5 @@ void PASTEF77(ch,blasname) \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
|
||||
void sswap_
|
||||
(
|
||||
const f77_int* n,
|
||||
float* x, const f77_int* incx,
|
||||
float* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_SWAP_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy);
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
float* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = (x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = (y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = (y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen) {
|
||||
/* Call BLIS kernel */
|
||||
bli_sswapv_zen_int8
|
||||
(
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else{
|
||||
PASTEMAC2(s,swapv,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
n0, \
|
||||
x0, incx0, \
|
||||
y0, incy0, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
}
|
||||
|
||||
void dswap_
|
||||
(
|
||||
const f77_int* n,
|
||||
double* x, const f77_int* incx,
|
||||
double* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_SWAP_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy);
|
||||
dim_t n0;
|
||||
double* x0;
|
||||
double* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = (x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = (y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = (y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
|
||||
/* Call BLIS kernel */
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (bamdzen) {
|
||||
bli_dswapv_zen_int8
|
||||
(
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else{
|
||||
PASTEMAC2(d,swapv,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
n0, \
|
||||
x0, incx0, \
|
||||
y0, incy0, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BLAS_CZ( swap, swapv )
|
||||
|
||||
#else
|
||||
INSERT_GENTFUNC_BLAS( swap, swapv )
|
||||
#endif
|
||||
#endif
|
||||
|
||||
268
frame/compat/bla_swap_amd.c
Normal file
268
frame/compat/bla_swap_amd.c
Normal file
@@ -0,0 +1,268 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-to-BLIS interfaces.
|
||||
//
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ftype, ch, blasname, blisname ) \
|
||||
\
|
||||
void PASTEF77(ch,blasname) \
|
||||
( \
|
||||
const f77_int* n, \
|
||||
ftype* x, const f77_int* incx, \
|
||||
ftype* y, const f77_int* incy \
|
||||
) \
|
||||
{ \
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
|
||||
dim_t n0; \
|
||||
ftype* x0; \
|
||||
ftype* y0; \
|
||||
inc_t incx0; \
|
||||
inc_t incy0; \
|
||||
\
|
||||
/* Initialize BLIS. */ \
|
||||
bli_init_auto(); \
|
||||
\
|
||||
/* Convert/typecast negative values of n to zero. */ \
|
||||
bli_convert_blas_dim1( *n, n0 ); \
|
||||
\
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */ \
|
||||
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
|
||||
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
|
||||
\
|
||||
/* Call BLIS interface. */ \
|
||||
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
n0, \
|
||||
x0, incx0, \
|
||||
y0, incy0, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
\
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
|
||||
void sswap_
|
||||
(
|
||||
const f77_int* n,
|
||||
float* x, const f77_int* incx,
|
||||
float* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_SWAP_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy);
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
float* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = (x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = (y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = (y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE) {
|
||||
/* Call BLIS kernel */
|
||||
bli_sswapv_zen_int8
|
||||
(
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else{
|
||||
PASTEMAC2(s,swapv,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
n0, \
|
||||
x0, incx0, \
|
||||
y0, incy0, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
}
|
||||
|
||||
void dswap_
|
||||
(
|
||||
const f77_int* n,
|
||||
double* x, const f77_int* incx,
|
||||
double* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_SWAP_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy);
|
||||
dim_t n0;
|
||||
double* x0;
|
||||
double* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = (x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = (y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = (y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == TRUE) {
|
||||
bli_dswapv_zen_int8
|
||||
(
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else{
|
||||
PASTEMAC2(d,swapv,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
n0, \
|
||||
x0, incx0, \
|
||||
y0, incy0, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BLAS_CZ( swap, swapv )
|
||||
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
1544
frame/compat/bla_trsm_amd.c
Normal file
1544
frame/compat/bla_trsm_amd.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2017 - 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2017 - 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -64,16 +64,7 @@ void bli_sscalv_zen_int10
|
||||
if ( PASTEMAC(s,eq0)( *alpha ) )
|
||||
{
|
||||
float* zero = bli_s0;
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
bli_ssetv_zen_int
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n,
|
||||
zero,
|
||||
x, incx,
|
||||
cntx
|
||||
);
|
||||
#else
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
|
||||
f
|
||||
(
|
||||
@@ -83,7 +74,7 @@ void bli_sscalv_zen_int10
|
||||
x, incx,
|
||||
cntx
|
||||
);
|
||||
#endif
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -342,16 +333,7 @@ void bli_dscalv_zen_int10
|
||||
if ( PASTEMAC(d,eq0)( *alpha ) )
|
||||
{
|
||||
double* zero = bli_d0;
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
bli_dsetv_zen_int
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n,
|
||||
zero,
|
||||
x, incx,
|
||||
cntx
|
||||
);
|
||||
#else
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
|
||||
|
||||
f
|
||||
@@ -362,7 +344,7 @@ void bli_dscalv_zen_int10
|
||||
x, incx,
|
||||
cntx
|
||||
);
|
||||
#endif
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -95,29 +95,6 @@ void bli_caxpyf_zen_int_4
|
||||
// operation as a loop over axpyv.
|
||||
if ( b_n != fuse_fac )
|
||||
{
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
scomplex* a1 = a + (0 )*inca + (i )*lda;
|
||||
scomplex* chi1 = x + (i )*incx;
|
||||
scomplex* y1 = y + (0 )*incy;
|
||||
scomplex alpha_chi1;
|
||||
|
||||
bli_ccopycjs( conjx, *chi1, alpha_chi1 );
|
||||
bli_cscals( *alpha, alpha_chi1 );
|
||||
|
||||
bli_caxpyv_zen_int5
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
caxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx );
|
||||
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
@@ -141,7 +118,6 @@ void bli_caxpyf_zen_int_4
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -357,28 +333,6 @@ void bli_zaxpyf_zen_int_4
|
||||
// operation as a loop over axpyv.
|
||||
if ( b_n != fuse_fac )
|
||||
{
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
dcomplex* a1 = a + (0 )*inca + (i )*lda;
|
||||
dcomplex* chi1 = x + (i )*incx;
|
||||
dcomplex* y1 = y + (0 )*incy;
|
||||
dcomplex alpha_chi1;
|
||||
|
||||
bli_zcopycjs( conjx, *chi1, alpha_chi1 );
|
||||
bli_zscals( *alpha, alpha_chi1 );
|
||||
|
||||
bli_zaxpyv_zen_int5
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
#else
|
||||
zaxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DCOMPLEX, BLIS_AXPYV_KER, cntx );
|
||||
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
@@ -402,7 +356,6 @@ void bli_zaxpyf_zen_int_4
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -108,29 +108,6 @@ void bli_saxpyf_zen_int_5
|
||||
// operation as a loop over axpyv.
|
||||
if ( b_n != fuse_fac )
|
||||
{
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
float* a1 = a + (0 )*inca + (i )*lda;
|
||||
float* chi1 = x + (i )*incx;
|
||||
float* y1 = y + (0 )*incy;
|
||||
float alpha_chi1;
|
||||
|
||||
bli_scopycjs( conjx, *chi1, alpha_chi1 );
|
||||
bli_sscals( *alpha, alpha_chi1 );
|
||||
|
||||
bli_saxpyv_zen_int10
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
|
||||
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
@@ -154,7 +131,6 @@ void bli_saxpyf_zen_int_5
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -382,29 +358,6 @@ void bli_daxpyf_zen_int_5
|
||||
// operation as a loop over axpyv.
|
||||
if ( b_n != fuse_fac )
|
||||
{
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
double* a1 = a + (0 )*inca + (i )*lda;
|
||||
double* chi1 = x + (i )*incx;
|
||||
double* y1 = y + (0 )*incy;
|
||||
double alpha_chi1;
|
||||
|
||||
bli_dcopycjs( conjx, *chi1, alpha_chi1 );
|
||||
bli_dscals( *alpha, alpha_chi1 );
|
||||
|
||||
bli_daxpyv_zen_int10
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
|
||||
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
@@ -428,7 +381,6 @@ void bli_daxpyf_zen_int_5
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -655,29 +607,6 @@ static void bli_daxpyf_zen_int_16x2
|
||||
// operation as a loop over axpyv.
|
||||
if ( b_n != fuse_fac )
|
||||
{
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
double* a1 = a + (0 )*inca + (i )*lda;
|
||||
double* chi1 = x + (i )*incx;
|
||||
double* y1 = y + (0 )*incy;
|
||||
double alpha_chi1;
|
||||
|
||||
bli_dcopycjs( conjx, *chi1, alpha_chi1 );
|
||||
bli_dscals( *alpha, alpha_chi1 );
|
||||
|
||||
bli_daxpyv_zen_int10
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
|
||||
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
@@ -701,7 +630,6 @@ static void bli_daxpyf_zen_int_16x2
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -966,43 +894,21 @@ void bli_daxpyf_zen_int_16x4
|
||||
// operation as a loop over axpyv.
|
||||
if ( b_n != fuse_fac )
|
||||
{
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
if(b_n & 2)
|
||||
{
|
||||
bli_daxpyf_zen_int_16x2( conja,
|
||||
conjx,
|
||||
m, 2,
|
||||
alpha, a, inca, lda,
|
||||
x, incx,
|
||||
y, incy,
|
||||
cntx
|
||||
);
|
||||
b_n -= 2;
|
||||
a += 2*lda;
|
||||
x += 2 * incx;
|
||||
}
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
double* a1 = a + (0 )*inca + (i )*lda;
|
||||
double* chi1 = x + (i )*incx;
|
||||
double* y1 = y + (0 )*incy;
|
||||
double alpha_chi1;
|
||||
if (b_n & 2)
|
||||
{
|
||||
bli_daxpyf_zen_int_16x2( conja,
|
||||
conjx,
|
||||
m, 2,
|
||||
alpha, a, inca, lda,
|
||||
x, incx,
|
||||
y, incy,
|
||||
cntx
|
||||
);
|
||||
b_n -= 2;
|
||||
a += 2*lda;
|
||||
x += 2 * incx;
|
||||
}
|
||||
|
||||
bli_dcopycjs( conjx, *chi1, alpha_chi1 );
|
||||
bli_dscals( *alpha, alpha_chi1 );
|
||||
|
||||
bli_daxpyv_zen_int10
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
|
||||
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
@@ -1026,7 +932,6 @@ void bli_daxpyf_zen_int_16x4
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1396,29 +1301,6 @@ void bli_caxpyf_zen_int_5
|
||||
// operation as a loop over axpyv.
|
||||
if ( b_n != fuse_fac )
|
||||
{
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
scomplex* a1 = a + (0 )*inca + (i )*lda;
|
||||
scomplex* chi1 = x + (i )*incx;
|
||||
scomplex* y1 = y + (0 )*incy;
|
||||
scomplex alpha_chi1;
|
||||
|
||||
bli_ccopycjs( conjx, *chi1, alpha_chi1 );
|
||||
bli_cscals( *alpha, alpha_chi1 );
|
||||
|
||||
bli_caxpyv_zen_int5
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
caxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx );
|
||||
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
@@ -1442,7 +1324,6 @@ void bli_caxpyf_zen_int_5
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1810,29 +1691,6 @@ void bli_zaxpyf_zen_int_5
|
||||
// operation as a loop over axpyv.
|
||||
if ( b_n != fuse_fac )
|
||||
{
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
dcomplex* a1 = a + (0 )*inca + (i )*lda;
|
||||
dcomplex* chi1 = x + (i )*incx;
|
||||
dcomplex* y1 = y + (0 )*incy;
|
||||
dcomplex alpha_chi1;
|
||||
|
||||
bli_zcopycjs( conjx, *chi1, alpha_chi1 );
|
||||
bli_zscals( *alpha, alpha_chi1 );
|
||||
|
||||
bli_zaxpyv_zen_int5
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
zaxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DCOMPLEX, BLIS_AXPYV_KER, cntx );
|
||||
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
@@ -1855,8 +1713,7 @@ void bli_zaxpyf_zen_int_5
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -97,28 +97,6 @@ void bli_saxpyf_zen_int_6
|
||||
// operation as a loop over axpyv.
|
||||
if ( b_n != fuse_fac )
|
||||
{
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
float* a1 = a + (0 )*inca + (i )*lda;
|
||||
float* chi1 = x + (i )*incx;
|
||||
float* y1 = y + (0 )*incy;
|
||||
float alpha_chi1;
|
||||
|
||||
bli_scopycjs( conjx, *chi1, alpha_chi1 );
|
||||
bli_sscals( *alpha, alpha_chi1 );
|
||||
|
||||
bli_saxpyv_zen_int10
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
#else
|
||||
saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
|
||||
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
@@ -141,7 +119,7 @@ void bli_saxpyf_zen_int_6
|
||||
cntx
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2017-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -114,16 +114,9 @@ err_t bli_gemm_small
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
|
||||
return BLIS_NOT_YET_IMPLEMENTED;
|
||||
#else
|
||||
// When dynamic dispatch is enabled i.e. library is built for 'amdzen' configuration.
|
||||
// Invoke architecture specific kernels only if we are sure that we are running on zen,
|
||||
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
|
||||
arch_t id = bli_arch_query_id();
|
||||
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
|
||||
(id == BLIS_ARCH_ZEN3) ||
|
||||
(id == BLIS_ARCH_ZEN2) ||
|
||||
(id == BLIS_ARCH_ZEN);
|
||||
|
||||
if (0 == bamdzen)
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// Non-AVX platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx_supported() == FALSE)
|
||||
{
|
||||
return BLIS_NOT_YET_IMPLEMENTED;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user