frame/3/gemm/bli_gemm_front.c

Change-Id: I52a0fbc1d33bb948d430942323bbc5fe44e3ca13
This commit is contained in:
praveeng
2017-05-20 16:53:50 +05:30
163 changed files with 10350 additions and 2347 deletions

1179
CHANGELOG

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like

View File

@@ -260,7 +260,9 @@ We also have a third paper, submitted to IPDPS 2014, on achieving
```
A fourth paper, submitted to ACM TOMS, also exists, which proposes an
[analytical model](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf) for determining blocksize parameters in BLIS:
[analytical model](http://dl.acm.org/citation.cfm?id=2925987)
([unofficial backup link](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf))
for determining blocksize parameters in BLIS:
```
@article{BLIS4,
@@ -278,6 +280,32 @@ A fourth paper, submitted to ACM TOMS, also exists, which proposes an
}
```
A fifth paper, submitted to ACM TOMS, begins the study of so-called
[induced methods for complex matrix multiplication](http://www.cs.utexas.edu/users/flame/pubs/blis5_toms_rev2.pdf):
```
@article{BLIS5,
author = {Field G. {V}an~{Z}ee and Tyler Smith},
title = {Implementing high-performance complex matrix multiplication via the 3m and 4m methods},
journal = {ACM Transactions on Mathematical Software},
year = {2017},
note = {accepted}
}
```
A sixth paper, submitted to ACM TOMS, revisits the topic of the previous
article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev0.pdf):
```
@article{BLIS6,
author = {Field G. {V}an~{Z}ee},
title = {Implementing high-performance complex matrix multiplication via the 1m method},
journal = {ACM Transactions on Mathematical Software},
note = {submitted}
}
```
Funding
-------

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/sh
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -41,7 +41,11 @@
main()
{
CC=gcc
# Pick the C compiler: prefer clang when it can actually be executed,
# otherwise fall back to gcc.
# NOTE: the probe must be run as a command. Wrapping it in `[ ... ]` turns
# it into the malformed test expression `[ clang -v ]`, which never
# succeeds, so clang would never be selected.
if clang -v > /dev/null 2>&1; then
    CC=clang
else
    CC=gcc
fi
CPUID_SRC=cpuid_x86.c
CPUID_BIN=blis_cpu_detect
ARCH=reference
@@ -59,12 +63,6 @@ main()
# of the distribution and the directory in which we are building.
cur_dirpath="."
OSNAME=`uname`
if [ $OSNAME = "Darwin" ]; then
CC=clang
fi
#
# Detect architecture by predefined macros
#

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/sh
#
# BLIS
# An object-based framework for developing high-performance BLAS-like

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/sh
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -47,4 +47,4 @@ if [ $? -eq 0 ]; then
else
echo "Test Pass"
exit 0
fi
fi

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/sh
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -374,10 +374,6 @@ gen_mkfiles()
read_mkfile_config()
{
local index lname
declare -i count
# Read the file describing file suffixes.
src_file_suffixes=$(cat "${suffix_file}")

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/sh
#
# BLIS
# An object-based framework for developing high-performance BLAS-like

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/sh
#
# BLIS
# An object-based framework for developing high-performance BLAS-like

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/sh
#
# BLIS
# An object-based framework for developing high-performance BLAS-like

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -77,7 +77,9 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
ifneq ($(CC_VENDOR),icc)
LDFLAGS := -lm
endif

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -77,7 +77,9 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
ifneq ($(CC_VENDOR),icc)
LDFLAGS := -lm
endif

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -47,8 +47,12 @@ ifeq ($(CC),)
CC := gcc
CC_VENDOR := gcc
endif
ifneq ($(CC_VENDOR),gcc)
$(error gcc is required for this configuration.)
# Abort the build unless the compiler vendor is gcc or clang.
ifneq ($(CC_VENDOR),gcc)
  ifneq ($(CC_VENDOR),clang)
    $(error gcc or clang are required for this configuration.)
  endif
endif
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
@@ -77,7 +81,9 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
ifneq ($(CC_VENDOR),icc)
LDFLAGS := -lm
endif

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -47,8 +47,12 @@ ifeq ($(CC),)
CC := gcc
CC_VENDOR := gcc
endif
ifneq ($(CC_VENDOR),gcc)
$(error gcc is required for this configuration.)
# Abort the build unless the compiler vendor is gcc or clang.
ifneq ($(CC_VENDOR),gcc)
  ifneq ($(CC_VENDOR),clang)
    $(error gcc or clang are required for this configuration.)
  endif
endif
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
@@ -77,7 +81,9 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
ifneq ($(CC_VENDOR),icc)
LDFLAGS := -lm
endif

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -77,7 +77,9 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
ifneq ($(CC_VENDOR),icc)
LDFLAGS := -lm
endif

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -77,7 +77,9 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
ifneq ($(CC_VENDOR),icc)
LDFLAGS := -lm
endif

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -88,7 +88,9 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
ifneq ($(CC_VENDOR),icc)
LDFLAGS := -lm
endif

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like

View File

@@ -52,17 +52,6 @@
// -- sgemm micro-kernel --
#if 1
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16
#define BLIS_DEFAULT_MC_S 144
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 4080
#define BLIS_DEFAULT_MR_S 6
#define BLIS_DEFAULT_NR_S 16
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
#if 0
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24
#define BLIS_DEFAULT_MC_S 256
@@ -74,6 +63,17 @@
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
#if 1
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16
#define BLIS_DEFAULT_MC_S 144
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 4080
#define BLIS_DEFAULT_MR_S 6
#define BLIS_DEFAULT_NR_S 16
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
#if 0
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6
#define BLIS_DEFAULT_MC_S 144
@@ -85,17 +85,6 @@
// -- dgemm micro-kernel --
#if 1
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8
#define BLIS_DEFAULT_MC_D 72
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 6
#define BLIS_DEFAULT_NR_D 8
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
#if 0
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12
#define BLIS_DEFAULT_MC_D 152
@@ -107,6 +96,17 @@
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
#if 1
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8
#define BLIS_DEFAULT_MC_D 72
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 6
#define BLIS_DEFAULT_NR_D 8
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
#if 0
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6
#define BLIS_DEFAULT_MC_D 72

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -88,7 +88,9 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
ifneq ($(CC_VENDOR),icc)
LDFLAGS := -lm
endif

View File

@@ -43,11 +43,22 @@
#define BLIS_SIMD_SIZE 64
#define BLIS_SIMD_NUM_REGISTERS 32
// Choose the allocator pair behind BLIS_MALLOC_POOL / BLIS_FREE_POOL.
// By default the hbwmalloc routines are used; defining BLIS_NO_HBWMALLOC
// switches to the standard C library's malloc()/free() instead.
#ifndef BLIS_NO_HBWMALLOC
#include <hbwmalloc.h>
#define BLIS_MALLOC_POOL hbw_malloc
#define BLIS_FREE_POOL hbw_free
#else
#include <stdlib.h>
#define BLIS_MALLOC_POOL malloc
#define BLIS_FREE_POOL free
#endif
//#define BLIS_MALLOC_INTL hbw_malloc
//#define BLIS_FREE_INTL hbw_free

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -65,6 +65,10 @@ else
COPTFLAGS := -O3
endif
# When the debug type is "sde", define BLIS_NO_HBWMALLOC for the
# preprocessor.
ifeq ($(DEBUG_TYPE),sde)
CPPROCFLAGS += -DBLIS_NO_HBWMALLOC
endif
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
@@ -95,7 +99,16 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm -lmemkind
# Link against libmemkind unless the debug type is "sde"; in the sde case
# start from an empty set of linker flags.
ifneq ($(DEBUG_TYPE),sde)
LDFLAGS := -lmemkind
else
LDFLAGS :=
endif
# Append libm for every compiler vendor except icc.
ifneq ($(CC_VENDOR),icc)
LDFLAGS += -lm
endif

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -77,7 +77,9 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
ifneq ($(CC_VENDOR),icc)
LDFLAGS := -lm
endif

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -77,7 +77,11 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
# -mmic is passed to the linker for every compiler vendor; libm is added
# explicitly for every vendor except icc.
LDFLAGS := -mmic
ifneq ($(CC_VENDOR),icc)
LDFLAGS += -lm
endif

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -47,8 +47,12 @@ ifeq ($(CC),)
CC := gcc
CC_VENDOR := gcc
endif
ifneq ($(CC_VENDOR),gcc)
$(error gcc is required for this configuration.)
# Abort the build unless the compiler vendor is gcc or clang.
ifneq ($(CC_VENDOR),gcc)
  ifneq ($(CC_VENDOR),clang)
    $(error gcc or clang are required for this configuration.)
  endif
endif
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
@@ -77,7 +81,9 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
ifneq ($(CC_VENDOR),icc)
LDFLAGS := -lm
endif

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -63,7 +63,9 @@ ARFLAGS := rcs
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS :=
ifneq ($(CC_VENDOR),icc)
LDFLAGS := -lm
endif
# --- Determine the finalizer and related flags ---
FINALIZER := pnacl-finalize

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -77,7 +77,9 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
ifneq ($(CC_VENDOR),icc)
LDFLAGS := -lm
endif

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -86,7 +86,9 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
ifneq ($(CC_VENDOR),icc)
LDFLAGS := -lm
endif

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -88,7 +88,9 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
ifneq ($(CC_VENDOR),icc)
LDFLAGS := -lm
endif

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -77,7 +77,9 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
ifneq ($(CC_VENDOR),icc)
LDFLAGS := -lm
endif

5
configure vendored
View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
@@ -458,6 +458,9 @@ main()
if [ -n "${debug_flag}" ]; then
if [ "x${debug_type}" = "xopt" ]; then
echo "${script_name}: enabling debug symbols with optimizations."
elif [ "x${debug_type}" = "xsde" ]; then
debug_type='sde'
echo "${script_name}: enabling SDE processor emulation."
else
debug_type='noopt'
echo "${script_name}: enabling debug symbols; optimizations disabled."

View File

@@ -41,7 +41,7 @@
#undef GENFRONT
#define GENFRONT( opname, kertype ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
\
@@ -68,15 +68,15 @@ GENFRONT( swapv, BLIS_SWAPV_KER )
#undef GENFRONT
#define GENFRONT( opname, kertype, dep1, dep2, dep3, dep4 ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(dep1,_cntx_init)( cntx ); \
PASTEMAC(dep2,_cntx_init)( cntx ); \
PASTEMAC(dep3,_cntx_init)( cntx ); \
PASTEMAC(dep4,_cntx_init)( cntx ); \
PASTEMAC(dep1,_cntx_init)( dt, cntx ); \
PASTEMAC(dep2,_cntx_init)( dt, cntx ); \
PASTEMAC(dep3,_cntx_init)( dt, cntx ); \
PASTEMAC(dep4,_cntx_init)( dt, cntx ); \
\
/* Initialize the context with the kernel associated with the current
operation. */ \
@@ -93,12 +93,12 @@ GENFRONT( axpbyv, BLIS_AXPBYV_KER, axpyv, xpbyv, scal2v, scalv )
#undef GENFRONT
#define GENFRONT( opname, kertype, depname ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(depname,_cntx_init)( cntx ); \
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
\
/* Initialize the context with the kernel associated with the current
operation. */ \
@@ -116,13 +116,13 @@ GENFRONT( scalv, BLIS_SCALV_KER, setv )
#undef GENFRONT
#define GENFRONT( opname, kertype, dep1, dep2 ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(dep1,_cntx_init)( cntx ); \
PASTEMAC(dep2,_cntx_init)( cntx ); \
PASTEMAC(dep1,_cntx_init)( dt, cntx ); \
PASTEMAC(dep2,_cntx_init)( dt, cntx ); \
\
/* Initialize the context with the kernel associated with the current
operation. */ \

View File

@@ -40,7 +40,7 @@
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx );
GENPROT( addv )

View File

@@ -53,7 +53,7 @@ void PASTEMAC(ch,opname) \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
\
@@ -88,7 +88,7 @@ void PASTEMAC(ch,opname) \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
\
@@ -123,7 +123,7 @@ void PASTEMAC(ch,opname) \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
\
@@ -160,7 +160,7 @@ void PASTEMAC(ch,opname) \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
\
@@ -198,7 +198,7 @@ void PASTEMAC(ch,opname) \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
\
@@ -238,7 +238,7 @@ void PASTEMAC(ch,opname) \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
\
@@ -274,7 +274,7 @@ void PASTEMAC(ch,opname) \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
\
@@ -306,7 +306,7 @@ void PASTEMAC(ch,opname) \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
\
@@ -340,7 +340,7 @@ void PASTEMAC(ch,opname) \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
\
@@ -373,7 +373,7 @@ void PASTEMAC(ch,opname) \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
\

View File

@@ -41,12 +41,12 @@
#undef GENFRONT
#define GENFRONT( opname, depname ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(depname,_cntx_init)( cntx ); \
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
} \
\
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \

View File

@@ -40,7 +40,7 @@
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx );
GENPROT( addd )

View File

@@ -90,7 +90,7 @@ void PASTEMAC(ch,opname) \
} \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* Query the context for the operation's kernel address. */ \
PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
@@ -166,7 +166,7 @@ void PASTEMAC(ch,opname) \
} \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* Query the context for the operation's kernel address. */ \
PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
@@ -222,7 +222,7 @@ void PASTEMAC(ch,opname) \
x1 = x + offx; \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* Query the context for the operation's kernel address. */ \
PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
@@ -276,7 +276,7 @@ void PASTEMAC(ch,opname) \
x1 = x + offx; \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* Query the context for the operation's kernel address. */ \
PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
@@ -349,7 +349,7 @@ void PASTEMAC(ch,opname) \
incx = 2*incx; \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* Query the context for the operation's kernel address. */ \
PASTECH2(chr,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt_r, kerid, cntx_p ); \

View File

@@ -41,12 +41,12 @@
#undef GENFRONT
#define GENFRONT( opname, kertype, depname ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(depname,_cntx_init)( cntx ); \
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
\
/* Initialize the context with the kernel associated with the current
operation. */ \
@@ -63,13 +63,13 @@ GENFRONT( axpy2v, BLIS_AXPY2V_KER, axpyv )
#undef GENFRONT
#define GENFRONT( opname, kertype, depname1, depname2 ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(depname1,_cntx_init)( cntx ); \
PASTEMAC(depname2,_cntx_init)( cntx ); \
PASTEMAC(depname1,_cntx_init)( dt, cntx ); \
PASTEMAC(depname2,_cntx_init)( dt, cntx ); \
\
/* Initialize the context with the kernel associated with the current
operation. */ \
@@ -86,12 +86,12 @@ GENFRONT( dotaxpyv, BLIS_DOTAXPYV_KER, dotxv, axpyv )
#undef GENFRONT
#define GENFRONT( opname, kertype, depname ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(depname,_cntx_init)( cntx ); \
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
\
/* Initialize the context with the kernel associated with the current
operation. */ \
@@ -114,13 +114,13 @@ GENFRONT( axpyf, BLIS_AXPYF_KER, axpyv )
#undef GENFRONT
#define GENFRONT( opname, kertype, depname1, depname2 ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(depname1,_cntx_init)( cntx ); \
PASTEMAC(depname2,_cntx_init)( cntx ); \
PASTEMAC(depname1,_cntx_init)( dt, cntx ); \
PASTEMAC(depname2,_cntx_init)( dt, cntx ); \
\
/* Initialize the context with the kernel associated with the current
operation. */ \

View File

@@ -40,7 +40,7 @@
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx );
GENPROT( axpy2v )

View File

@@ -57,7 +57,7 @@ void PASTEMAC(ch,opname) \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \
\
@@ -99,7 +99,7 @@ void PASTEMAC(ch,opname) \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \
\
@@ -142,7 +142,7 @@ void PASTEMAC(ch,opname) \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \
\
@@ -190,7 +190,7 @@ void PASTEMAC(ch,opname) \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \
\
@@ -238,7 +238,7 @@ void PASTEMAC(ch,opname) \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \
\

View File

@@ -41,12 +41,12 @@
#undef GENFRONT
#define GENFRONT( opname, depname ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(depname,_cntx_init)( cntx ); \
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
} \
\
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
@@ -64,13 +64,13 @@ GENFRONT( subm, subv )
#undef GENFRONT
#define GENFRONT( opname, depname1, depname2 ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(depname1,_cntx_init)( cntx ); \
PASTEMAC(depname2,_cntx_init)( cntx ); \
PASTEMAC(depname1,_cntx_init)( dt, cntx ); \
PASTEMAC(depname2,_cntx_init)( dt, cntx ); \
} \
\
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \

View File

@@ -40,7 +40,7 @@
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx );
GENPROT( addm )

View File

@@ -91,6 +91,7 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \
);
INSERT_GENTDEF( packm_cxk_ker )
INSERT_GENTDEF( packm_cxk_1er_ker )
// packm_3mis_ker

View File

@@ -54,12 +54,13 @@ void PASTEMAC(ch,opname) \
cntx_t* cntx \
) \
{ \
cntx_t* cntx_p; \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
if ( bli_zero_dim2( m, n ) ) return; \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* Invoke the helper variant, which loops over the appropriate kernel
to implement the current operation. */ \
@@ -118,12 +119,13 @@ void PASTEMAC(ch,opname) \
cntx_t* cntx \
) \
{ \
cntx_t* cntx_p; \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
if ( bli_zero_dim2( m, n ) ) return; \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* Invoke the helper variant, which loops over the appropriate kernel
to implement the current operation. */ \
@@ -187,7 +189,8 @@ void PASTEMAC(ch,opname) \
cntx_t* cntx \
) \
{ \
cntx_t* cntx_p; \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
if ( bli_zero_dim2( m, n ) ) return; \
\
@@ -195,7 +198,7 @@ void PASTEMAC(ch,opname) \
if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* Invoke the helper variant, which loops over the appropriate kernel
to implement the current operation. */ \
@@ -256,12 +259,13 @@ void PASTEMAC(ch,opname) \
cntx_t* cntx \
) \
{ \
cntx_t* cntx_p; \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
if ( bli_zero_dim2( m, n ) ) return; \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* If alpha is zero, then we set the output matrix to zero. This
seemingly minor optimization is important because it will clear
@@ -344,12 +348,13 @@ void PASTEMAC(ch,opname) \
cntx_t* cntx \
) \
{ \
cntx_t* cntx_p; \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
if ( bli_zero_dim2( m, n ) ) return; \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* Invoke the helper variant, which loops over the appropriate kernel
to implement the current operation. */ \

View File

@@ -48,9 +48,11 @@
#include "bli_packm_struc_cxk_4mi.h"
#include "bli_packm_struc_cxk_3mis.h"
#include "bli_packm_struc_cxk_rih.h"
#include "bli_packm_struc_cxk_1er.h"
#include "bli_packm_cxk.h"
#include "bli_packm_cxk_4mi.h"
#include "bli_packm_cxk_3mis.h"
#include "bli_packm_cxk_rih.h"
#include "bli_packm_cxk_1er.h"

View File

@@ -90,6 +90,12 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
// 0111 row/col panels: real+imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 1000 row/col panels: 1m-expanded (1e)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
// 1001 row/col panels: 1m-reordered (1r)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
};

View File

@@ -39,7 +39,7 @@
// Define context initialization functions.
//
void bli_packm_cntx_init( cntx_t* cntx )
void bli_packm_cntx_init( num_t dt, cntx_t* cntx )
{
bli_cntx_obj_create( cntx );

View File

@@ -40,7 +40,7 @@
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx );
GENPROT( packm )

View File

@@ -0,0 +1,489 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Element type of the kernel lookup table defined below. */
#define FUNCPTR_T packm_cxk_1er_ker_vft
/* First dimension of the lookup table. The table is indexed by micro-panel
   width, so 32 gives slots for widths 0 through 31. Any earlier definition
   of the macro is discarded first. */
#undef FUNCPTR_ARRAY_LENGTH
#define FUNCPTR_ARRAY_LENGTH 32
/* Dispatch table of optimized 1e packing kernels, indexed as
   ftypes_e[micro-panel width][datatype]. Only the widths listed below
   have kernels, and within each row only the second and fourth datatype
   slots (the C and Z kernels) are filled in. Every element that is not
   named here is implicitly a null pointer because objects with static
   storage duration are zero-initialized; a NULL entry tells the caller
   that no kernel exists for that width/datatype combination. */
static FUNCPTR_T ftypes_e[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] =
{
	[ 2] = { NULL, BLIS_CPACKM_2XK_1E_KERNEL,  NULL, BLIS_ZPACKM_2XK_1E_KERNEL  },
	[ 4] = { NULL, BLIS_CPACKM_4XK_1E_KERNEL,  NULL, BLIS_ZPACKM_4XK_1E_KERNEL  },
	[ 6] = { NULL, BLIS_CPACKM_6XK_1E_KERNEL,  NULL, BLIS_ZPACKM_6XK_1E_KERNEL  },
	[ 8] = { NULL, BLIS_CPACKM_8XK_1E_KERNEL,  NULL, BLIS_ZPACKM_8XK_1E_KERNEL  },
	[10] = { NULL, BLIS_CPACKM_10XK_1E_KERNEL, NULL, BLIS_ZPACKM_10XK_1E_KERNEL },
	[12] = { NULL, BLIS_CPACKM_12XK_1E_KERNEL, NULL, BLIS_ZPACKM_12XK_1E_KERNEL },
	[14] = { NULL, BLIS_CPACKM_14XK_1E_KERNEL, NULL, BLIS_ZPACKM_14XK_1E_KERNEL },
	[16] = { NULL, BLIS_CPACKM_16XK_1E_KERNEL, NULL, BLIS_ZPACKM_16XK_1E_KERNEL },
	[30] = { NULL, BLIS_CPACKM_30XK_1E_KERNEL, NULL, BLIS_ZPACKM_30XK_1E_KERNEL },
};
/* Dispatch table of optimized 1r packing kernels, indexed as
   ftypes_r[micro-panel width][datatype]. Only the widths listed below
   have kernels (note that, unlike the 1e table, width 3 is present), and
   within each row only the second and fourth datatype slots (the C and Z
   kernels) are filled in. Every element that is not named here is
   implicitly a null pointer because objects with static storage duration
   are zero-initialized; a NULL entry tells the caller that no kernel
   exists for that width/datatype combination. */
static FUNCPTR_T ftypes_r[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] =
{
	[ 2] = { NULL, BLIS_CPACKM_2XK_1R_KERNEL,  NULL, BLIS_ZPACKM_2XK_1R_KERNEL  },
	[ 3] = { NULL, BLIS_CPACKM_3XK_1R_KERNEL,  NULL, BLIS_ZPACKM_3XK_1R_KERNEL  },
	[ 4] = { NULL, BLIS_CPACKM_4XK_1R_KERNEL,  NULL, BLIS_ZPACKM_4XK_1R_KERNEL  },
	[ 6] = { NULL, BLIS_CPACKM_6XK_1R_KERNEL,  NULL, BLIS_ZPACKM_6XK_1R_KERNEL  },
	[ 8] = { NULL, BLIS_CPACKM_8XK_1R_KERNEL,  NULL, BLIS_ZPACKM_8XK_1R_KERNEL  },
	[10] = { NULL, BLIS_CPACKM_10XK_1R_KERNEL, NULL, BLIS_ZPACKM_10XK_1R_KERNEL },
	[12] = { NULL, BLIS_CPACKM_12XK_1R_KERNEL, NULL, BLIS_ZPACKM_12XK_1R_KERNEL },
	[14] = { NULL, BLIS_CPACKM_14XK_1R_KERNEL, NULL, BLIS_ZPACKM_14XK_1R_KERNEL },
	[16] = { NULL, BLIS_CPACKM_16XK_1R_KERNEL, NULL, BLIS_ZPACKM_16XK_1R_KERNEL },
	[30] = { NULL, BLIS_CPACKM_30XK_1R_KERNEL, NULL, BLIS_ZPACKM_30XK_1R_KERNEL },
};
/* Template for the 1e/1r packm kernel front-end.

   Given a micro-panel of width panel_dim and length panel_len, the
   generated function first looks for an optimized kernel in ftypes_e
   (1e schema) or ftypes_r (any other schema, treated as 1r), keyed by
   panel_dim and the function's datatype. If one exists it is invoked and
   the function returns. Otherwise the panel is packed by the inlined
   reference loops below, which scale each element of a by kappa
   (optionally conjugating a) while writing it to p.

   The cntx argument is accepted but not referenced in this body. */
#undef  GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t  conja, \
       pack_t  schema, \
       dim_t   panel_dim, \
       dim_t   panel_len, \
       void*   kappa, \
       void*   a, inc_t inca, inc_t lda, \
       void*   p,             inc_t ldp, \
       cntx_t* cntx \
     ) \
{ \
	/* Datatype id of the function being instantiated. */ \
	const num_t dt = PASTEMAC(ch,type); \
	/* Stays NULL when panel_dim lies outside the table, so an oversized
	   width is handled exactly like an in-range but unimplemented one. */ \
	FUNCPTR_T   f  = NULL; \
	dim_t       i, j; \
\
	if ( panel_dim < FUNCPTR_ARRAY_LENGTH ) \
	{ \
		f = ( bli_is_1e_packed( schema ) ? ftypes_e[panel_dim][dt] \
		                                 : ftypes_r[panel_dim][dt] ); \
	} \
\
	/* Fast path: an optimized kernel exists for this width. */ \
	if ( f != NULL ) \
	{ \
		f( conja, panel_len, kappa, a, inca, lda, p, ldp ); \
		return; \
	} \
\
	/* Slow path: inlined scal2m-style loops. The micro-panel is treated
	   as panel_dim x panel_len and column-stored (unit row stride). */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		/* 1e: every source element is handed to the scal2(j)1es macro
		   together with two destinations, one in p and one ldp/2
		   complex elements further on. */ \
		ctype* restrict kappa_cast = ( ctype* )kappa; \
		ctype* restrict a_ri       = ( ctype* )a; \
		ctype* restrict p_ri       = ( ctype* )p; \
		ctype* restrict p_ir       = ( ctype* )p + ldp/2; \
\
		if ( bli_is_conj( conja ) ) \
		{ \
			for ( j = 0; j < panel_len; ++j ) \
			{ \
				ctype* a_col    = a_ri + j*lda; \
				ctype* p_col_ri = p_ri + j*ldp; \
				ctype* p_col_ir = p_ir + j*ldp; \
\
				for ( i = 0; i < panel_dim; ++i ) \
				{ \
					ctype* restrict alpha11_ri = a_col    + i*inca; \
					ctype* restrict pi11_ri    = p_col_ri + i; \
					ctype* restrict pi11_ir    = p_col_ir + i; \
\
					PASTEMAC(ch,scal2j1es)( *kappa_cast, \
					                        *alpha11_ri, \
					                        *pi11_ri, \
					                        *pi11_ir ); \
				} \
			} \
		} \
		else \
		{ \
			for ( j = 0; j < panel_len; ++j ) \
			{ \
				ctype* a_col    = a_ri + j*lda; \
				ctype* p_col_ri = p_ri + j*ldp; \
				ctype* p_col_ir = p_ir + j*ldp; \
\
				for ( i = 0; i < panel_dim; ++i ) \
				{ \
					ctype* restrict alpha11_ri = a_col    + i*inca; \
					ctype* restrict pi11_ri    = p_col_ri + i; \
					ctype* restrict pi11_ir    = p_col_ir + i; \
\
					PASTEMAC(ch,scal21es)( *kappa_cast, \
					                       *alpha11_ri, \
					                       *pi11_ri, \
					                       *pi11_ir ); \
				} \
			} \
		} \
	} \
	else \
	{ \
		/* 1r: all pointers are in units of the real type, hence the
		   doubled source strides. Real parts go to p_r and imaginary
		   parts ldp real elements after them; consecutive packed
		   columns are 2*ldp real elements apart. */ \
		ctype_r* restrict kappa_r = ( ctype_r* )kappa; \
		ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \
		ctype_r* restrict a_r     = ( ctype_r* )a; \
		ctype_r* restrict a_i     = ( ctype_r* )a + 1; \
		ctype_r* restrict p_r     = ( ctype_r* )p; \
		ctype_r* restrict p_i     = ( ctype_r* )p + ldp; \
		const dim_t       inca2   = 2*inca; \
		const dim_t       lda2    = 2*lda; \
		const dim_t       ldp2    = 2*ldp; \
\
		if ( bli_is_conj( conja ) ) \
		{ \
			for ( j = 0; j < panel_len; ++j ) \
			{ \
				ctype_r* a_col_r = a_r + j*lda2; \
				ctype_r* a_col_i = a_i + j*lda2; \
				ctype_r* p_col_r = p_r + j*ldp2; \
				ctype_r* p_col_i = p_i + j*ldp2; \
\
				for ( i = 0; i < panel_dim; ++i ) \
				{ \
					ctype_r* restrict alpha11_r = a_col_r + i*inca2; \
					ctype_r* restrict alpha11_i = a_col_i + i*inca2; \
					ctype_r* restrict pi11_r    = p_col_r + i; \
					ctype_r* restrict pi11_i    = p_col_i + i; \
\
					PASTEMAC(ch,scal2jris)( *kappa_r, \
					                        *kappa_i, \
					                        *alpha11_r, \
					                        *alpha11_i, \
					                        *pi11_r, \
					                        *pi11_i ); \
				} \
			} \
		} \
		else \
		{ \
			for ( j = 0; j < panel_len; ++j ) \
			{ \
				ctype_r* a_col_r = a_r + j*lda2; \
				ctype_r* a_col_i = a_i + j*lda2; \
				ctype_r* p_col_r = p_r + j*ldp2; \
				ctype_r* p_col_i = p_i + j*ldp2; \
\
				for ( i = 0; i < panel_dim; ++i ) \
				{ \
					ctype_r* restrict alpha11_r = a_col_r + i*inca2; \
					ctype_r* restrict alpha11_i = a_col_i + i*inca2; \
					ctype_r* restrict pi11_r    = p_col_r + i; \
					ctype_r* restrict pi11_i    = p_col_i + i; \
\
					PASTEMAC(ch,scal2ris)( *kappa_r, \
					                       *kappa_i, \
					                       *alpha11_r, \
					                       *alpha11_i, \
					                       *pi11_r, \
					                       *pi11_i ); \
				} \
			} \
		} \
	} \
}
INSERT_GENTFUNCCO_BASIC0( packm_cxk_1er )

View File

@@ -0,0 +1,55 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_packm_cxk_1e_ref.h"
#include "bli_packm_cxk_1r_ref.h"
// Prototype template for the 1e/1r packm kernel front-end.
//   conja      - whether the source is to be conjugated while packing
//   schema     - pack schema; selects between the 1e and 1r layouts
//   panel_dim  - width of the micro-panel (short dimension)
//   panel_len  - length of the micro-panel (long dimension)
//   kappa      - scalar applied to every packed element
//   a,inca,lda - source panel, its element increment and leading dimension
//   p,ldp      - destination (packed) panel and its leading dimension
//   cntx       - context pointer
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
pack_t schema, \
dim_t panel_dim, \
dim_t panel_len, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t ldp, \
cntx_t* cntx \
);
// Instantiate the prototype for packm_cxk_1er.
INSERT_GENTPROTCO_BASIC( packm_cxk_1er )

View File

@@ -121,11 +121,11 @@ siz_t bli_packm_init
if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK )
{
schema = bli_cntx_get_pack_schema_a( cntx );
schema = bli_cntx_get_pack_schema_a_block( cntx );
}
else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL )
{
schema = bli_cntx_get_pack_schema_b( cntx );
schema = bli_cntx_get_pack_schema_b_panel( cntx );
}
else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL )
{

View File

@@ -0,0 +1,610 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Template for the structure-aware 1e/1r micro-panel packing routine.

   The generated function:
   1. derives the micro-panel width/length and the matching source and
      destination strides from the row/column bit of the pack schema;
   2. packs the panel, either by calling the kernel front-end (kername)
      directly for general matrices, or by delegating to the
      Hermitian/symmetric or triangular helper;
   3. fills the padding regions (rows m_panel..m_panel_max and columns
      n_panel..n_panel_max) with zeros and, for triangular matrices with
      padding in both dimensions, writes ones on the padded diagonal.

   NOTE: the is_p parameter is not referenced anywhere in this body. */
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool_t invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
cntx_t* cntx \
) \
{ \
dim_t panel_dim; \
dim_t panel_len; \
inc_t incc, ldc; \
inc_t ldp; \
\
\
/* Determine the dimensions and relative strides of the micro-panel
based on its pack schema. */ \
if ( bli_is_col_packed( schema ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
panel_len = m_panel; \
incc = cs_c; \
ldc = rs_c; \
ldp = rs_p; \
} \
else /* if ( bli_is_row_packed( schema ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \
panel_len = n_panel; \
incc = rs_c; \
ldc = cs_c; \
ldp = cs_p; \
} \
\
\
/* Handle micro-panel packing based on the structure of the matrix
being packed. */ \
if ( bli_is_general( strucc ) ) \
{ \
/* For micro-panels of general matrices, we can call the pack
kernel front-end directly. */ \
PASTEMAC(ch,kername) \
( \
conjc, \
schema, \
panel_dim, \
panel_len, \
kappa, \
c, incc, ldc, \
p, ldp, \
cntx \
); \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
/* Call a helper function for micro-panels of Hermitian/symmetric
matrices. */ \
PASTEMAC(ch,packm_herm_cxk_1er) \
( \
strucc, \
diagoffc, \
uploc, \
conjc, \
schema, \
m_panel, \
n_panel, \
m_panel_max, \
n_panel_max, \
panel_dim, \
panel_len, \
kappa, \
c, rs_c, cs_c, \
incc, ldc, \
p, rs_p, cs_p, \
ldp, \
cntx \
); \
} \
else /* ( bli_is_triangular( strucc ) ) */ \
{ \
/* Call a helper function for micro-panels of triangular
matrices. */ \
PASTEMAC(ch,packm_tri_cxk_1er) \
( \
strucc, \
diagoffc, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
m_panel, \
n_panel, \
m_panel_max, \
n_panel_max, \
panel_dim, \
panel_len, \
kappa, \
c, rs_c, cs_c, \
incc, ldc, \
p, rs_p, cs_p, \
ldp, \
cntx \
); \
} \
\
\
/* The packed memory region was acquired/allocated with "aligned"
dimensions (ie: dimensions that were possibly inflated up to a
multiple). When these dimensions are inflated, it creates empty
regions along the bottom and/or right edges of the matrix. If
either region exists, we set them to zero. This allows the
micro-kernel to remain simple since it does not need to support
different register blockings for the edge cases. */ \
{ \
/* Bottom edge: rows m_panel .. m_panel_max-1, all n_panel_max
columns. */ \
if ( m_panel != m_panel_max ) \
{ \
ctype* restrict zero = PASTEMAC(ch,0); \
dim_t offm = m_panel; \
dim_t offn = 0; \
dim_t m_edge = m_panel_max - m_panel; \
dim_t n_edge = n_panel_max; \
\
PASTEMAC(ch,set1ms_mxn) \
( \
schema, \
offm, \
offn, \
m_edge, \
n_edge, \
zero, \
p, rs_p, cs_p, ldp \
); \
} \
\
/* Right edge: columns n_panel .. n_panel_max-1, all m_panel_max
rows. */ \
if ( n_panel != n_panel_max ) \
{ \
ctype* restrict zero = PASTEMAC(ch,0); \
dim_t offm = 0; \
dim_t offn = n_panel; \
dim_t m_edge = m_panel_max; \
dim_t n_edge = n_panel_max - n_panel; \
\
PASTEMAC(ch,set1ms_mxn) \
( \
schema, \
offm, \
offn, \
m_edge, \
n_edge, \
zero, \
p, rs_p, cs_p, ldp \
); \
} \
\
if ( bli_is_triangular( strucc ) ) \
{ \
/* If this micro-panel is an edge case in both panel dimension and
length, then it must be a bottom-right corner case, which
typically only happens for micro-panels being packed for trsm.
(It also happens for trmm if kr > 1.) Here, we set the part of
the diagonal that extends into the zero-padded region to
identity. This prevents NaNs and Infs from creeping into the
computation. If this code does execute for trmm, it is okay,
because those 1.0's that extend into the bottom-right region
end up getting multiplied by the 0.0's in the zero-padded region
of the other matrix. */ \
if ( m_panel != m_panel_max && \
n_panel != n_panel_max ) \
{ \
ctype* restrict one = PASTEMAC(ch,1); \
dim_t offm = m_panel; \
dim_t offn = n_panel; \
dim_t m_edge = m_panel_max - m_panel; \
dim_t n_edge = n_panel_max - n_panel; \
\
PASTEMAC(ch,set1ms_mxn_diag) \
( \
schema, \
offm, \
offn, \
m_edge, \
n_edge, \
one, \
p, rs_p, cs_p, ldp \
); \
} \
} \
} \
\
\
/*
if ( bli_is_1r_packed( schema ) ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1r): bp", m_panel_max, 2*n_panel_max, \
( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \
} \
\
if ( bli_is_1e_packed( schema ) ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1e): ap", 2*m_panel_max, 2*n_panel_max, \
( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \
} \
*/ \
}
INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er )
/* Template for the helper that packs a micro-panel of a Hermitian or
   symmetric matrix in 1e/1r format.

   Panels that do not intersect the diagonal are packed with a single
   call to the kernel front-end (kername); if such a panel lies in the
   unstored triangle, the source pointer and strides are first redirected
   to the reflected, stored location (toggling conjugation for Hermitian
   matrices). Panels that do intersect the diagonal are packed in three
   steps: the sub-panel before the diagonal block (p10), the sub-panel
   from the diagonal block onward (p12), and finally the stored triangle
   of the diagonal block itself (p11), with the diagonal re-scaled from
   the real part of the source when the matrix is Hermitian.

   NOTE: m_panel_max and n_panel_max are not referenced in this body. */
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp, \
cntx_t* cntx \
) \
{ \
doff_t diagoffc_abs; \
dim_t j; \
bool_t row_stored; \
bool_t col_stored; \
\
\
/* Create flags to indicate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
/* Handle the case where the micro-panel does NOT intersect the
diagonal separately from the case where it does intersect. */ \
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
{ \
/* If the current panel is unstored, we need to make a few
adjustments so we refer to the data where it is actually
stored, also taking conjugation into account. (Note this
implicitly assumes we are operating on a dense panel
within a larger symmetric or Hermitian matrix, since a
general matrix would not contain any unstored region.) */ \
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
{ \
c = c + diagoffc * ( doff_t )cs_c + \
-diagoffc * ( doff_t )rs_c; \
bli_swap_incs( incc, ldc ); \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc ); \
} \
\
/* Pack the full panel. */ \
PASTEMAC(ch,kername) \
( \
conjc, \
schema, \
panel_dim, \
panel_len, \
kappa, \
c, incc, ldc, \
p, ldp, \
cntx \
); \
} \
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
{ \
ctype* restrict c10; \
ctype* restrict p10; \
dim_t p10_dim, p10_len; \
inc_t incc10, ldc10; \
doff_t diagoffc10; \
conj_t conjc10; \
\
ctype* restrict c12; \
ctype* restrict p12; \
dim_t p12_dim, p12_len; \
inc_t incc12, ldc12; \
doff_t diagoffc12; \
conj_t conjc12; \
\
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then somehow the constraint on
cache blocksizes being a whole multiple of the register
blocksizes was violated. */ \
if ( ( col_stored && diagoffc < 0 ) || \
( row_stored && diagoffc > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
diagoffc_abs = bli_abs( diagoffc ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
/* p10 (the first diagoffc_abs columns of the panel) is read
directly from c. p12 (everything from the diagonal block on)
is read with incc/ldc swapped, i.e. from the reflected
location. */ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs; \
p10 = p; \
c10 = c; \
incc10 = incc; \
ldc10 = ldc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
/* Since j == diagoffc_abs here, diagoffc12 evaluates to zero and
the diagonal-offset adjustment of c12 below is a no-op. */ \
diagoffc12 = diagoffc_abs - j; \
p12 = p + (j )*ldp; \
c12 = c + (j )*ldc; \
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
-diagoffc12 * ( doff_t )rs_c; \
incc12 = ldc; \
ldc12 = incc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc12 ); \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
/* Here the roles are reversed: p10 (which now also spans the
diagonal block) is read from the reflected location with
incc/ldc swapped, and p12 is read directly from c. */ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs + panel_dim; \
diagoffc10 = diagoffc; \
p10 = p; \
c10 = c; \
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
-diagoffc10 * ( doff_t )rs_c; \
incc10 = ldc; \
ldc10 = incc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
p12 = p + (j )*ldp; \
c12 = c + (j )*ldc; \
incc12 = incc; \
ldc12 = ldc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc10 ); \
} \
\
/* Pack to p10. For upper storage, this includes the unstored
triangle of c11. */ \
PASTEMAC(ch,kername) \
( \
conjc10, \
schema, \
p10_dim, \
p10_len, \
kappa, \
c10, incc10, ldc10, \
p10, ldp, \
cntx \
); \
\
/* Pack to p12. For lower storage, this includes the unstored
triangle of c11. */ \
PASTEMAC(ch,kername) \
( \
conjc12, \
schema, \
p12_dim, \
p12_len, \
kappa, \
c12, incc12, ldc12, \
p12, ldp, \
cntx \
); \
\
/* Pack the stored triangle of c11 to p11. */ \
{ \
/* NOTE: this j shadows the function-scope j declared above. */ \
dim_t j = diagoffc_abs; \
ctype* restrict c11 = c + (j )*ldc; \
ctype* restrict p11 = p + (j )*ldp; \
\
PASTEMAC(ch,scal21ms_mxn_uplo) \
( \
schema, \
uploc, \
conjc, \
panel_dim, \
kappa, \
c11, rs_c, cs_c, \
p11, rs_p, cs_p, ldp \
); \
\
/* If we are packing a micro-panel with Hermitian structure,
we must take special care of the diagonal. Now, if kappa
were guaranteed to be unit, all we would need to do is
explicitly zero out the imaginary part of the diagonal of
p11, in case the diagonal of the source matrix contained
garbage (non-zero) imaginary values. HOWEVER, since kappa
can be non-unit, things become a little more complicated.
In general, we must re-apply the kappa scalar to ONLY the
real part of the diagonal of the source matrix and save
the result to the diagonal of p11. */ \
if ( bli_is_hermitian( strucc ) ) \
{ \
/* View c11 as an array of reals; the strides are doubled
because they are now expressed in real elements. */ \
ctype_r* restrict c11_r = ( ctype_r* )c11; \
const dim_t rs_c2 = 2*rs_c; \
const dim_t cs_c2 = 2*cs_c; \
\
PASTEMAC3(ch,chr,ch,scal21ms_mxn_diag) \
( \
schema, \
panel_dim, \
panel_dim, \
kappa, \
c11_r, rs_c2, cs_c2, \
p11, rs_p, cs_p, ldp \
); \
} \
} \
} \
}
INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_1er, packm_cxk_1er )
/* Template for the helper that packs a micro-panel of a triangular
   matrix in 1e/1r format.

   The whole panel is first packed with the kernel front-end (kername).
   The panel_dim x panel_dim diagonal block p11, located diagoffp_abs
   leading-dimension steps into p, is then fixed up in place: its
   diagonal is overwritten with kappa when the source has an implicit
   unit diagonal, the diagonal is inverted when invdiag is TRUE, and the
   strictly opposite (unstored) triangle is set to zero.

   NOTE: strucc, m_panel, n_panel, m_panel_max and n_panel_max are not
   referenced in this body. */
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffp, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool_t invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp, \
cntx_t* cntx \
) \
{ \
/* Start of the diagonal block within the packed panel. */ \
doff_t diagoffp_abs = bli_abs( diagoffp ); \
ctype* p11 = p + (diagoffp_abs )*ldp; \
\
\
/* Pack the panel. */ \
PASTEMAC(ch,kername) \
( \
conjc, \
schema, \
panel_dim, \
panel_len, \
kappa, \
c, incc, ldc, \
p, ldp, \
cntx \
); \
\
\
/* Tweak the panel according to its triangular structure */ \
{ \
/* If the diagonal of c is implicitly unit, explicitly set the
diagonal of the packed panel to kappa. */ \
if ( bli_is_unit_diag( diagc ) ) \
{ \
PASTEMAC(ch,set1ms_mxn_diag) \
( \
schema, \
0, \
0, \
panel_dim, \
panel_dim, \
kappa, \
p11, rs_p, cs_p, ldp \
); \
} \
\
\
/* If requested, invert the diagonal of the packed panel. */ \
if ( invdiag == TRUE ) \
{ \
PASTEMAC(ch,invert1ms_mxn_diag) \
( \
schema, \
0, \
0, \
panel_dim, \
panel_dim, \
p11, rs_p, cs_p, ldp \
); \
} \
\
\
/* Set the region opposite the diagonal of p to zero. To do this,
we need to reference the "unstored" region on the other side of
the diagonal. This amounts to toggling uploc and then shifting
the diagonal offset to shrink the newly referenced region (by
one diagonal). Note that this zero-filling is not needed for
trsm, since the unstored region is not referenced by the trsm
micro-kernel; however, zero-filling is needed for trmm, which
uses the gemm micro-kernel.*/ \
{ \
ctype* restrict zero = PASTEMAC(ch,0); \
uplo_t uplop = uploc; \
doff_t diagoffp11_0 = 0; \
dim_t p11_0_dim = panel_dim - 1; \
\
bli_toggle_uplo( uplop ); \
bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp11_0 ); \
\
/* Note that this macro works a little differently than the setm
operation. Here, we pass in the dimensions of only p11, rather
than the whole micro-panel, and furthermore we pass in the
"shrunken" dimensions of p11, corresponding to the toggling
and shrinking of the diagonal above. The macro will do the
right thing, incrementing the pointer to p11 by the appropriate
leading dimension (cs_p or rs_p), and setting only the lower
or upper triangle to zero. */ \
PASTEMAC(ch,set1ms_mxn_uplo) \
( \
schema, \
diagoffp11_0, \
uplop, \
p11_0_dim, \
p11_0_dim, \
zero, \
p11, rs_p, cs_p, ldp \
); \
} \
} \
}
INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_1er, packm_cxk_1er )

View File

@@ -0,0 +1,117 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototype template for the structure-aware 1e/1r micro-panel packing
// routine. The parameter names mirror those used by the definition; in
// particular the diagonal offset is named diagoffc, as in the definition.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t         strucc, \
       doff_t          diagoffc, \
       diag_t          diagc, \
       uplo_t          uploc, \
       conj_t          conjc, \
       pack_t          schema, \
       bool_t          invdiag, \
       dim_t           m_panel, \
       dim_t           n_panel, \
       dim_t           m_panel_max, \
       dim_t           n_panel_max, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
                          inc_t is_p, \
       cntx_t*         cntx \
     );
// Instantiate the prototype for packm_struc_cxk_1er.
INSERT_GENTPROTCO_BASIC( packm_struc_cxk_1er )
// Prototype template for the helper that packs micro-panels of
// Hermitian/symmetric matrices in 1e/1r format. In addition to the
// m/n panel dimensions and the row/column strides of c and p, it
// receives the schema-derived panel_dim/panel_len, incc/ldc and ldp.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp, \
cntx_t* cntx \
);
// Instantiate the prototype for packm_herm_cxk_1er.
INSERT_GENTPROTCO_BASIC( packm_herm_cxk_1er )
// Prototype template for the helper that packs micro-panels of
// triangular matrices in 1e/1r format. The parameter names mirror those
// used by the definition; in particular the diagonal offset is named
// diagoffp, as in the definition.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t         strucc, \
       doff_t          diagoffp, \
       diag_t          diagc, \
       uplo_t          uploc, \
       conj_t          conjc, \
       pack_t          schema, \
       bool_t          invdiag, \
       dim_t           m_panel, \
       dim_t           n_panel, \
       dim_t           m_panel_max, \
       dim_t           n_panel_max, \
       dim_t           panel_dim, \
       dim_t           panel_len, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
                          inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
                          inc_t ldp, \
       cntx_t*         cntx \
     );
// Instantiate the prototype for packm_tri_cxk_1er.
INSERT_GENTPROTCO_BASIC( packm_tri_cxk_1er )

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,62 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Redefine level-1m kernel API names to induce prototypes.
// Each packm_<w>xk_ker_name macro is pointed at the corresponding
// packm_<w>xk_1e_ref name before the shared template header is included
// at the bottom of this block. (Presumably the template declares one
// prototype per name macro -- confirm against bli_l1m_ker.h.)
#undef packm_2xk_ker_name
#define packm_2xk_ker_name packm_2xk_1e_ref
// 1e format should probably never have an odd-numbered register blocking.
// The 3xk redefinition is therefore left disabled, so packm_3xk_ker_name
// keeps whatever definition it had before this point.
//#undef packm_3xk_ker_name
//#define packm_3xk_ker_name packm_3xk_1e_ref
#undef packm_4xk_ker_name
#define packm_4xk_ker_name packm_4xk_1e_ref
#undef packm_6xk_ker_name
#define packm_6xk_ker_name packm_6xk_1e_ref
#undef packm_8xk_ker_name
#define packm_8xk_ker_name packm_8xk_1e_ref
#undef packm_10xk_ker_name
#define packm_10xk_ker_name packm_10xk_1e_ref
#undef packm_12xk_ker_name
#define packm_12xk_ker_name packm_12xk_1e_ref
#undef packm_14xk_ker_name
#define packm_14xk_ker_name packm_14xk_1e_ref
#undef packm_16xk_ker_name
#define packm_16xk_ker_name packm_16xk_1e_ref
#undef packm_30xk_ker_name
#define packm_30xk_ker_name packm_30xk_1e_ref
// Include the level-1m kernel API template.
#include "bli_l1m_ker.h"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,61 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Redefine level-1m kernel API names to induce prototypes.
// Each #undef/#define pair below rebinds one generic packm_<N>xk_ker_name
// macro to the matching "_1r_ref" identifier, for panel widths 2, 3, 4, 6,
// 8, 10, 12, 14, 16 and 30. The #undef comes first so that any earlier
// binding of the same generic name is discarded before the new one is set.
// NOTE(review): the prototypes themselves are presumably generated by the
// template header included at the bottom, which is not visible here --
// confirm against bli_l1m_ker.h.
#undef packm_2xk_ker_name
#define packm_2xk_ker_name packm_2xk_1r_ref
#undef packm_3xk_ker_name
#define packm_3xk_ker_name packm_3xk_1r_ref
#undef packm_4xk_ker_name
#define packm_4xk_ker_name packm_4xk_1r_ref
#undef packm_6xk_ker_name
#define packm_6xk_ker_name packm_6xk_1r_ref
#undef packm_8xk_ker_name
#define packm_8xk_ker_name packm_8xk_1r_ref
#undef packm_10xk_ker_name
#define packm_10xk_ker_name packm_10xk_1r_ref
#undef packm_12xk_ker_name
#define packm_12xk_ker_name packm_12xk_1r_ref
#undef packm_14xk_ker_name
#define packm_14xk_ker_name packm_14xk_1r_ref
#undef packm_16xk_ker_name
#define packm_16xk_ker_name packm_16xk_1r_ref
#undef packm_30xk_ker_name
#define packm_30xk_ker_name packm_30xk_1r_ref
// Include the level-1m kernel API template.
// This must come after all of the redefinitions above so that the template
// sees the _1r_ref bindings.
#include "bli_l1m_ker.h"

View File

@@ -41,7 +41,7 @@
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
/* Perform basic setup on the context. */ \
bli_cntx_obj_create( cntx ); \
@@ -50,20 +50,20 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
operation. */ \
/*bli_gks_cntx_set_l1f_ker( BLIS_AXPYF_KER, cntx );*/ \
/*bli_gks_cntx_set_l1f_ker( BLIS_DOTXF_KER, cntx );*/ \
bli_axpyf_cntx_init( cntx ); \
bli_dotxf_cntx_init( cntx ); \
bli_axpyf_cntx_init( dt, cntx ); \
bli_dotxf_cntx_init( dt, cntx ); \
\
/*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \
/*bli_gks_cntx_set_l1v_ker( BLIS_DOTXV_KER, cntx );*/ \
/*bli_gks_cntx_set_l1v_ker( BLIS_SCALV_KER, cntx );*/ \
/*bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx );*/ \
bli_axpyv_cntx_init( cntx ); \
bli_dotxv_cntx_init( cntx ); \
bli_scalv_cntx_init( cntx ); \
bli_setv_cntx_init( cntx ); \
bli_axpyv_cntx_init( dt, cntx ); \
bli_dotxv_cntx_init( dt, cntx ); \
bli_scalv_cntx_init( dt, cntx ); \
bli_setv_cntx_init( dt, cntx ); \
\
/* Initialize the context with packm-related kernels. */ \
bli_packm_cntx_init( cntx ); \
bli_packm_cntx_init( dt, cntx ); \
\
/* Set the register and cache blocksizes and multiples, as well
as the execution method. */ \
@@ -88,7 +88,7 @@ GENFRONT( trsv )
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
/* Perform basic setup on the context. */ \
bli_cntx_obj_create( cntx ); \
@@ -96,10 +96,10 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
/* Initialize the context with kernels employed by the current
operation. */ \
/*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \
bli_axpyv_cntx_init( cntx ); \
bli_axpyv_cntx_init( dt, cntx ); \
\
/* Initialize the context with packm-related kernels. */ \
bli_packm_cntx_init( cntx ); \
bli_packm_cntx_init( dt, cntx ); \
\
/* Set the register and cache blocksizes and multiples, as well
as the execution method. */ \
@@ -122,7 +122,7 @@ GENFRONT( syr )
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
/* Perform basic setup on the context. */ \
bli_cntx_obj_create( cntx ); \
@@ -133,22 +133,22 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
/*bli_gks_cntx_set_l1f_ker( BLIS_AXPYF_KER, cntx );*/ \
/*bli_gks_cntx_set_l1f_ker( BLIS_DOTXF_KER, cntx );*/ \
/*bli_gks_cntx_set_l1f_ker( BLIS_DOTXAXPYF_KER, cntx );*/ \
bli_dotaxpyv_cntx_init( cntx ); \
bli_axpyf_cntx_init( cntx ); \
bli_dotxf_cntx_init( cntx ); \
bli_dotxaxpyf_cntx_init( cntx ); \
bli_dotaxpyv_cntx_init( dt, cntx ); \
bli_axpyf_cntx_init( dt, cntx ); \
bli_dotxf_cntx_init( dt, cntx ); \
bli_dotxaxpyf_cntx_init( dt, cntx ); \
\
/*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \
/*bli_gks_cntx_set_l1v_ker( BLIS_DOTXV_KER, cntx );*/ \
/*bli_gks_cntx_set_l1v_ker( BLIS_SCALV_KER, cntx );*/ \
/*bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx );*/ \
bli_axpyv_cntx_init( cntx ); \
bli_dotxv_cntx_init( cntx ); \
bli_scalv_cntx_init( cntx ); \
bli_setv_cntx_init( cntx ); \
bli_axpyv_cntx_init( dt, cntx ); \
bli_dotxv_cntx_init( dt, cntx ); \
bli_scalv_cntx_init( dt, cntx ); \
bli_setv_cntx_init( dt, cntx ); \
\
/* Initialize the context with packm-related kernels. */ \
bli_packm_cntx_init( cntx ); \
bli_packm_cntx_init( dt, cntx ); \
\
/* Set the register and cache blocksizes and multiples, as well
as the execution method. */ \
@@ -173,7 +173,7 @@ GENFRONT( symv )
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
/* Perform basic setup on the context. */ \
bli_cntx_obj_create( cntx ); \
@@ -182,11 +182,11 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
operation. */ \
/*bli_gks_cntx_set_l1f_ker( BLIS_AXPY2V_KER, cntx );*/ \
/*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \
bli_axpy2v_cntx_init( cntx ); \
bli_axpyv_cntx_init( cntx ); \
bli_axpy2v_cntx_init( dt, cntx ); \
bli_axpyv_cntx_init( dt, cntx ); \
\
/* Initialize the context with packm-related kernels. */ \
bli_packm_cntx_init( cntx ); \
bli_packm_cntx_init( dt, cntx ); \
\
/* Set the register and cache blocksizes and multiples, as well
as the execution method. */ \

View File

@@ -40,7 +40,7 @@
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx );
GENPROT( gemv )

View File

@@ -55,8 +55,9 @@ void PASTEMAC(ch,opname) \
cntx_t* cntx \
) \
{ \
cntx_t* cntx_p; \
dim_t m_y, n_x; \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
dim_t m_y, n_x; \
\
/* Determine the dimensions of y and x. */ \
bli_set_dims_with_trans( transa, m, n, m_y, n_x ); \
@@ -65,7 +66,7 @@ void PASTEMAC(ch,opname) \
if ( bli_zero_dim1( m_y ) ) return; \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* If x has zero elements, or if alpha is zero, scale y by beta and
return early. */ \
@@ -135,13 +136,14 @@ void PASTEMAC(ch,opname) \
cntx_t* cntx \
) \
{ \
cntx_t* cntx_p; \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
/* If x or y has zero elements, or if alpha is zero, return early. */ \
if ( bli_zero_dim2( m, n ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* Declare a void function pointer for the current operation. */ \
PASTECH2(ch,ftname,_ft) f; \
@@ -188,10 +190,11 @@ void PASTEMAC(ch,opname) \
cntx_t* cntx \
) \
{ \
cntx_t* cntx_p; \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* If x has zero elements, or if alpha is zero, scale y by beta and
return early. */ \
@@ -261,8 +264,9 @@ void PASTEMAC(ch,opname) \
cntx_t* cntx \
) \
{ \
cntx_t* cntx_p; \
ctype alpha_local; \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
ctype alpha_local; \
\
/* If x has zero elements, or if alpha is zero, return early. */ \
if ( bli_zero_dim1( m ) || PASTEMAC(chr,eq0)( *alpha ) ) return; \
@@ -273,7 +277,7 @@ void PASTEMAC(ch,opname) \
PASTEMAC2(chr,ch,copys)( *alpha, alpha_local ); \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* Declare a void function pointer for the current operation. */ \
PASTECH2(ch,ftname,_ft) f; \
@@ -324,13 +328,14 @@ void PASTEMAC(ch,opname) \
cntx_t* cntx \
) \
{ \
cntx_t* cntx_p; \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
/* If x has zero elements, or if alpha is zero, return early. */ \
if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* Declare a void function pointer for the current operation. */ \
PASTECH2(ch,ftname,_ft) f; \
@@ -383,13 +388,14 @@ void PASTEMAC(ch,opname) \
cntx_t* cntx \
) \
{ \
cntx_t* cntx_p; \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
/* If x has zero elements, or if alpha is zero, return early. */ \
if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* Declare a void function pointer for the current operation. */ \
PASTECH2(ch,ftname,_ft) f; \
@@ -444,10 +450,11 @@ void PASTEMAC(ch,opname) \
cntx_t* cntx \
) \
{ \
cntx_t* cntx_p; \
const num_t dt = PASTEMAC(ch,type); \
cntx_t* cntx_p; \
\
/* Initialize a local context if the given context is NULL. */ \
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
\
/* If x has zero elements, return early. */ \
if ( bli_zero_dim1( m ) ) return; \

View File

@@ -70,8 +70,8 @@ void bli_l3_cntl_create_if
else
{
// If the user provided a control tree, create a copy and use it
// instead (so that it can be used to cache things like pack mem_t
// entries).
// instead (so that each thread can use its own local copy of the tree
// as a place to cache things like pack mem_t entries).
*cntl_use = bli_cntl_copy( cntl_orig );
}
}

View File

@@ -38,7 +38,7 @@
// Define context initialization functions.
//
void bli_gemm_cntx_init( cntx_t* cntx )
void bli_gemm_cntx_init( num_t dt, cntx_t* cntx )
{
// Clear the context fields.
bli_cntx_obj_clear( cntx );
@@ -49,7 +49,7 @@ void bli_gemm_cntx_init( cntx_t* cntx )
bli_gks_cntx_set_l3_nat_ukr_prefs( BLIS_GEMM_UKR, cntx );
// Initialize the context with packm-related kernels.
bli_packm_cntx_init( cntx );
bli_packm_cntx_init( dt, cntx );
// Initialize the context with the current architecture's register
// and cache blocksizes (and multiples), given the execution method.
@@ -63,9 +63,8 @@ void bli_gemm_cntx_init( cntx_t* cntx )
cntx );
// Set the pack_t schemas for native execution.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS,
BLIS_PACKED_COL_PANELS,
cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
}
void bli_gemm_cntx_finalize( cntx_t* cntx )
@@ -74,7 +73,7 @@ void bli_gemm_cntx_finalize( cntx_t* cntx )
// -----------------------------------------------------------------------------
void bli_trsm_cntx_init( cntx_t* cntx )
void bli_trsm_cntx_init( num_t dt, cntx_t* cntx )
{
// Clear the context fields.
bli_cntx_obj_clear( cntx );
@@ -92,7 +91,7 @@ void bli_trsm_cntx_init( cntx_t* cntx )
bli_gks_cntx_set_l3_nat_ukr( BLIS_TRSM_U_UKR, cntx );
// Initialize the context with packm-related kernels.
bli_packm_cntx_init( cntx );
bli_packm_cntx_init( dt, cntx );
// Initialize the context with the current architecture's register
// and cache blocksizes (and multiples), given the execution method.
@@ -106,9 +105,8 @@ void bli_trsm_cntx_init( cntx_t* cntx )
cntx );
// Set the pack_t schemas for native execution.
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS,
BLIS_PACKED_COL_PANELS,
cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
}
void bli_trsm_cntx_finalize( cntx_t* cntx )

View File

@@ -40,7 +40,7 @@
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx );
GENPROT( gemm )

View File

@@ -39,8 +39,17 @@ cntl_t* bli_gemm_cntl_create
opid_t family
)
{
void* macro_kernel_p = bli_gemm_ker_var2;
return bli_gemmbp_cntl_create( family );
}
// -----------------------------------------------------------------------------
cntl_t* bli_gemmbp_cntl_create
(
opid_t family
)
{
void* macro_kernel_p = bli_gemm_ker_var2;
// Change the macro-kernel if the operation family is herk or trmm.
if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2;
@@ -64,7 +73,7 @@ cntl_t* bli_gemm_cntl_create
// Create a node for packing matrix A.
cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create
(
bli_gemm_packa,
bli_gemm_packa, // pack the left-hand operand
bli_packm_blk_var1,
BLIS_MR,
BLIS_KR,
@@ -87,7 +96,7 @@ cntl_t* bli_gemm_cntl_create
// Create a node for packing matrix B.
cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create
(
bli_gemm_packb,
bli_gemm_packb, // pack the right-hand operand
bli_packm_blk_var1,
BLIS_KR,
BLIS_NR,
@@ -118,6 +127,95 @@ cntl_t* bli_gemm_cntl_create
return gemm_cntl_vl_mm;
}
// -----------------------------------------------------------------------------
// Build the gemm control tree that uses bli_gemm_ker_var1 as its
// macro-kernel, and return the root (outermost) node.
//
// The nodes are created leaf-first because each node takes its child as an
// argument. Reading from the returned root down to the leaf, the tree is:
//
//   bli_gemm_blk_var1 (BLIS_NC)
//     -> bli_gemm_blk_var3 (BLIS_KC)
//       -> bli_gemm_packa, row panels, BLIS_BUFFER_FOR_B_PANEL
//         -> bli_gemm_blk_var2 (BLIS_MC)
//           -> bli_gemm_packb, column panels, BLIS_BUFFER_FOR_A_BLOCK
//             -> macro-kernel bli_gemm_ker_var1 (BLIS_NR)
//               -> leaf node (BLIS_MR)
//
// Note that the blocksize ids and pack buffer types are deliberately crossed
// relative to the operand names (the left-hand operand is packed into the
// "B panel" buffer and the right-hand operand into the "A block" buffer).
//
// Parameters:
//   family - operation family id. Currently unused: the code that would
//            select a family-specific macro-kernel is commented out below.
//
// Returns: pointer to the root cntl_t node of the newly created tree.
cntl_t* bli_gemmpb_cntl_create
(
opid_t family
)
{
void* macro_kernel_p = bli_gemm_ker_var1;
// Change the macro-kernel if the operation family is herk or trmm.
// NOTE: this selection is currently disabled, so bli_gemm_ker_var1 is used
// for every family.
//if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2;
//else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2;
// Create two nodes for the macro-kernel.
cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_obj_create
(
BLIS_MR, // needed for bli_thrinfo_rgrow()
NULL, // variant function pointer not used
NULL // no sub-node; this is the leaf of the tree.
);
cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_obj_create
(
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
macro_kernel_p,
gemm_cntl_ub_ke
);
// Create a node for packing matrix A (which is really the right-hand
// operand "B").
// The right-hand operand is packed as column panels into the buffer type
// normally associated with blocks of A.
cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create
(
bli_gemm_packb, // pack the right-hand operand
bli_packm_blk_var1,
BLIS_KR,
BLIS_MR,
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?
BLIS_PACKED_COL_PANELS,
BLIS_BUFFER_FOR_A_BLOCK,
gemm_cntl_pb_ub
);
// Create a node for partitioning the n dimension by MC.
cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_obj_create
(
BLIS_MC,
bli_gemm_blk_var2,
gemm_cntl_packb
);
// Create a node for packing matrix B (which is really the left-hand
// operand "A").
// The left-hand operand is packed as row panels into the buffer type
// normally associated with panels of B.
cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create
(
bli_gemm_packa, // pack the left-hand operand
bli_packm_blk_var1,
BLIS_NR,
BLIS_KR,
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?
BLIS_PACKED_ROW_PANELS,
BLIS_BUFFER_FOR_B_PANEL,
gemm_cntl_op_pb
);
// Create a node for partitioning the k dimension by KC.
cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create
(
BLIS_KC,
bli_gemm_blk_var3,
gemm_cntl_packa
);
// Create a node for partitioning the m dimension by NC.
cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create
(
BLIS_NC,
bli_gemm_blk_var1,
gemm_cntl_mm_op
);
// Hand the root of the tree back to the caller.
return gemm_cntl_vl_mm;
}
// -----------------------------------------------------------------------------
void bli_gemm_cntl_free
(
cntl_t* cntl,

View File

@@ -37,6 +37,20 @@ cntl_t* bli_gemm_cntl_create
opid_t family
);
// -----------------------------------------------------------------------------
// Control-tree constructors for two gemm algorithm variants. Each takes the
// operation family id and returns the root node of a new control tree.
// NOTE(review): "bp"/"pb" presumably stand for block-panel/panel-block --
// confirm against the definitions.
cntl_t* bli_gemmbp_cntl_create
(
opid_t family
);
cntl_t* bli_gemmpb_cntl_create
(
opid_t family
);
// -----------------------------------------------------------------------------
void bli_gemm_cntl_free
(
cntl_t* cntl,

View File

@@ -112,5 +112,6 @@ void bli_gemm_front
cntl
);
}
}

View File

@@ -0,0 +1,56 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Macro-kernel variant 1 for gemm, expressed entirely in terms of
// bli_gemm_ker_var2().
//
// Parameters:
//   a, b, c - operand objects; all three have a transposition induced on
//             them before the call is forwarded.
//   cntx    - context, forwarded unchanged.
//   cntl    - control tree node, forwarded unchanged.
//   thread  - thread info, forwarded unchanged.
//
// NOTE(review): bli_obj_induce_trans() is applied to *a, *b and *c, i.e. to
// the objects the caller passed in, not to local copies -- so the caller's
// objects are presumably left in their transposed state on return; confirm
// that callers expect this.
void bli_gemm_ker_var1
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t* thread
)
{
// Implement _ker_var1() in terms of _ker_var2() by transposing the
// entire suboperation (which also requires swapping A and B).
bli_obj_induce_trans( *a );
bli_obj_induce_trans( *b );
bli_obj_induce_trans( *c );
// Note the swapped operand order: b is passed in a's position and a in b's.
bli_gemm_ker_var2( b, a, c, cntx, cntl, thread );
}

View File

@@ -109,6 +109,26 @@ void bli_gemm_ker_var2
buf_alpha = bli_obj_internal_scalar_buffer( scalar_b );
buf_beta = bli_obj_internal_scalar_buffer( *c );
// If 1m is being employed on a column- or row-stored matrix with a
// real-valued beta, we can use the real domain macro-kernel, which
// eliminates a little overhead associated with the 1m virtual
// micro-kernel.
#if 1
if ( bli_is_1m_packed( schema_a ) )
{
bli_l3_ind_recast_1m_params
(
dt_exec,
schema_a,
c,
m, n, k,
pd_a, ps_a,
pd_b, ps_b,
rs_c, cs_c
);
}
#endif
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_exec];

View File

@@ -56,6 +56,7 @@ GENPROT( gemm_blk_var3 )
GENPROT( gemm_packa )
GENPROT( gemm_packb )
GENPROT( gemm_ker_var1 )
GENPROT( gemm_ker_var2 )
// Headers for induced algorithms:

View File

@@ -85,6 +85,7 @@ void bli_blksz_obj_free
// -----------------------------------------------------------------------------
#if 0
void bli_blksz_reduce_dt_to
(
num_t dt_bm, blksz_t* bmult,
@@ -116,6 +117,66 @@ void bli_blksz_reduce_dt_to
bli_blksz_set_def( blksz_def, dt_bs, blksz );
bli_blksz_set_max( blksz_max, dt_bs, blksz );
}
#endif
// -----------------------------------------------------------------------------
void bli_blksz_reduce_def_to
     (
       num_t dt_bm, blksz_t* bmult,
       num_t dt_bs, blksz_t* blksz
     )
{
	// Rounds the default blocksize stored in blksz for datatype dt_bs down
	// to a multiple of the default value stored in bmult for datatype dt_bm.
	// Only the "default" entry of bmult is consulted, and only the "default"
	// entry of blksz is modified.
	dim_t def_cur  = bli_blksz_get_def( dt_bs, blksz );
	dim_t multiple = bli_blksz_get_def( dt_bm, bmult );

	// A zero multiple leaves the blocksize untouched.
	if ( multiple != 0 )
	{
		// Strip the remainder so that the result is the largest multiple
		// of 'multiple' that does not exceed the current value.
		dim_t def_new = def_cur - ( def_cur % multiple );

		// Never let the blocksize collapse to zero; fall back to a single
		// multiple instead.
		if ( def_new == 0 ) def_new = multiple;

		// Write the rounded default blocksize back into the object.
		bli_blksz_set_def( def_new, dt_bs, blksz );
	}
}
// -----------------------------------------------------------------------------
void bli_blksz_reduce_max_to
     (
       num_t dt_bm, blksz_t* bmult,
       num_t dt_bs, blksz_t* blksz
     )
{
	// Rounds the maximum blocksize stored in blksz for datatype dt_bs down
	// to a multiple of the default value stored in bmult for datatype dt_bm.
	// The "max" entry of bmult is intentionally not used as the multiple;
	// only the "max" entry of blksz is modified.
	dim_t max_cur  = bli_blksz_get_max( dt_bs, blksz );
	dim_t multiple = bli_blksz_get_def( dt_bm, bmult );

	// A zero multiple leaves the blocksize untouched.
	if ( multiple != 0 )
	{
		// Strip the remainder so that the result is the largest multiple
		// of 'multiple' that does not exceed the current value.
		dim_t max_new = max_cur - ( max_cur % multiple );

		// Never let the blocksize collapse to zero; fall back to a single
		// multiple instead.
		if ( max_new == 0 ) max_new = multiple;

		// Write the rounded maximum blocksize back into the object.
		bli_blksz_set_max( max_new, dt_bs, blksz );
	}
}
// -----------------------------------------------------------------------------

View File

@@ -89,11 +89,23 @@
(b_dst)->e[ dt_dst ] = (b_src)->e[ dt_src ]; \
}
// Scale the v[] entry of blksz object b for datatype dt by the ratio
// num/den. The multiplication is performed first and the division second,
// using the integer arithmetic of the operands (so the result truncates).
// Every macro argument is parenthesized so that expression arguments
// (e.g. "x + 1") are not torn apart by operator precedence. Note that b and
// dt are each evaluated twice, so they must not have side effects.
#define bli_blksz_scale_def( num, den, dt, b ) \
{ \
	(b)->v[ (dt) ] = ( (b)->v[ (dt) ] * (num) ) / (den); \
}
// Same as bli_blksz_scale_def(), but operates on the e[] entry of b.
#define bli_blksz_scale_max( num, den, dt, b ) \
{ \
	(b)->e[ (dt) ] = ( (b)->e[ (dt) ] * (num) ) / (den); \
}
#if 0
#define bli_blksz_scale_dt_by( num, den, dt, b ) \
{ \
(b)->v[ dt ] = ( (b)->v[ dt ] * num ) / den; \
(b)->e[ dt ] = ( (b)->e[ dt ] * num ) / den; \
}
#endif
// -----------------------------------------------------------------------------
@@ -121,12 +133,25 @@ void bli_blksz_obj_free
// -----------------------------------------------------------------------------
#if 0
void bli_blksz_reduce_dt_to
(
num_t dt_bm, blksz_t* bmult,
num_t dt_bs, blksz_t* blksz
);
#endif
// Round the default blocksize of blksz (for datatype dt_bs) down to a
// multiple of the default value of bmult (for datatype dt_bm). A zero
// multiple is a no-op; a result of zero is replaced by the multiple itself.
void bli_blksz_reduce_def_to
(
num_t dt_bm, blksz_t* bmult,
num_t dt_bs, blksz_t* blksz
);
// Same as above, but rounds the maximum blocksize of blksz. The multiple is
// still taken from the default value of bmult.
void bli_blksz_reduce_max_to
(
num_t dt_bm, blksz_t* bmult,
num_t dt_bs, blksz_t* blksz
);
// -----------------------------------------------------------------------------
dim_t bli_determine_blocksize

View File

@@ -97,6 +97,16 @@ void bli_cntl_free
cntl_t* cntl,
thrinfo_t* thread
)
{
if ( thread != NULL ) bli_cntl_free_w_thrinfo( cntl, thread );
else bli_cntl_free_wo_thrinfo( cntl );
}
void bli_cntl_free_w_thrinfo
(
cntl_t* cntl,
thrinfo_t* thread
)
{
// Base case: simply return when asked to free NULL nodes.
if ( cntl == NULL ) return;
@@ -112,7 +122,7 @@ void bli_cntl_free
{
// Recursively free all memory associated with the sub-node and its
// children.
bli_cntl_free( cntl_sub_node, thread_sub_node );
bli_cntl_free_w_thrinfo( cntl_sub_node, thread_sub_node );
}
// Free the current node's params field, if it is non-NULL.
@@ -122,8 +132,8 @@ void bli_cntl_free
}
// Release the current node's pack mem_t entry back to the memory
// broker from which it originated, but only if the current thread
// is chief for its group, and only if the mem_t is allocated.
// broker from which it originated, but only if the mem_t entry is
// allocated, and only if the current thread is chief for its group.
if ( bli_thread_am_ochief( thread ) )
if ( bli_mem_is_alloc( cntl_pack_mem ) )
{
@@ -134,6 +144,42 @@ void bli_cntl_free
bli_cntl_obj_free( cntl );
}
void bli_cntl_free_wo_thrinfo
     (
       cntl_t* cntl
     )
{
	// Frees a control tree without consulting any thread info: the sub-tree
	// is freed first, then this node's params, pack mem_t entry, and finally
	// the node itself.

	// Nothing to do for an empty (sub-)tree; this also ends the recursion.
	if ( cntl == NULL ) return;

	// Grab everything we need from the node up front.
	cntl_t* child    = bli_cntl_sub_node( cntl );
	void*   params   = bli_cntl_params( cntl );
	mem_t*  pack_mem = bli_cntl_pack_mem( cntl );

	// Free the child and all of its descendants before touching this node.
	bli_cntl_free_wo_thrinfo( child );

	// The params field is optional; free it only when present.
	if ( params != NULL )
		bli_free_intl( params );

	// Return the pack mem_t entry to the memory broker it came from, but
	// only when it is actually allocated.
	if ( bli_mem_is_alloc( pack_mem ) )
		bli_membrk_release( pack_mem );

	// Lastly, free the node itself.
	bli_cntl_obj_free( cntl );
}
// -----------------------------------------------------------------------------
cntl_t* bli_cntl_copy

View File

@@ -75,12 +75,25 @@ void bli_cntl_obj_clear
cntl_t* cntl
);
// -----------------------------------------------------------------------------
void bli_cntl_free
(
cntl_t* cntl,
thrinfo_t* thread
);
// Recursively free a control tree using the given thread info. A node's
// pack mem_t entry is released only if the thread is chief of its group
// and the entry is allocated.
void bli_cntl_free_w_thrinfo
(
cntl_t* cntl,
thrinfo_t* thread
);
// Recursively free a control tree when no thread info is available. A
// node's pack mem_t entry is released whenever it is allocated.
void bli_cntl_free_wo_thrinfo
(
cntl_t* cntl
);
cntl_t* bli_cntl_copy
(
cntl_t* cntl

View File

@@ -330,14 +330,24 @@ ind_t bli_cntx_get_ind_method( cntx_t* cntx )
return bli_cntx_method( cntx );
}
pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx )
pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx )
{
return bli_cntx_schema_a( cntx );
return bli_cntx_schema_a_block( cntx );
}
pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx )
pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx )
{
return bli_cntx_schema_b( cntx );
return bli_cntx_schema_b_panel( cntx );
}
// Function-form accessor: returns whatever bli_cntx_schema_c_panel()
// yields for the given context.
pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx )
{
return bli_cntx_schema_c_panel( cntx );
}
// Function-form accessor: returns whatever bli_cntx_anti_pref() yields for
// the given context.
bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx )
{
return bli_cntx_anti_pref( cntx );
}
#endif
@@ -386,27 +396,27 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
{
/* Example prototypes:
void
bli_cntx_set_blkszs(
void bli_cntx_set_blkszs
(
ind_t method = BLIS_NAT,
dim_t n_bs,
bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id,
bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id,
bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id,
...
cntx_t* cntx
);
ind_t method = BLIS_NAT,
dim_t n_bs,
bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id,
bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id,
bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id,
...
cntx_t* cntx );
void
bli_cntx_set_blkszs(
ind_t method != BLIS_NAT,
dim_t n_bs,
bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, dim_t scalr0,
bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, dim_t scalr1,
bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, dim_t scalr2,
...
cntx_t* cntx );
void bli_cntx_set_blkszs
(
ind_t method != BLIS_NAT,
dim_t n_bs,
bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, double def_scalr0, double max_scalr0,
bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, double def_scalr1, double max_scalr1,
bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, double def_scalr2, double max_scalr2,
...
cntx_t* cntx
);
*/
va_list args;
dim_t i;
@@ -414,7 +424,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
bszid_t* bszids;
blksz_t** blkszs;
bszid_t* bmults;
dim_t* scalrs;
double* dsclrs;
double* msclrs;
cntx_t* cntx;
@@ -426,7 +437,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) );
blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ) );
bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ) );
scalrs = bli_malloc_intl( n_bs * sizeof( dim_t ) );
dsclrs = bli_malloc_intl( n_bs * sizeof( double ) );
msclrs = bli_malloc_intl( n_bs * sizeof( double ) );
// -- Begin variable argument section --
@@ -444,9 +456,9 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
// - the address of the blksz_t object, and
// - the bszid_t of the multiple we need to associate with
// the blksz_t object.
const bszid_t bs_id = va_arg( args, bszid_t );
blksz_t* blksz = va_arg( args, blksz_t* );
const bszid_t bm_id = va_arg( args, bszid_t );
bszid_t bs_id = va_arg( args, bszid_t );
blksz_t* blksz = va_arg( args, blksz_t* );
bszid_t bm_id = va_arg( args, bszid_t );
// Store the values in our temporary arrays.
bszids[ i ] = bs_id;
@@ -464,18 +476,21 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
// - the address of the blksz_t object, and
// - the bszid_t of the multiple we need to associate with
// the blksz_t object.
// - the scalar we wish to apply to the real blocksizes to
// come up with the induced complex blocksizes.
const bszid_t bs_id = va_arg( args, bszid_t );
blksz_t* blksz = va_arg( args, blksz_t* );
const bszid_t bm_id = va_arg( args, bszid_t );
const dim_t scalr = va_arg( args, dim_t );
// - the scalars we wish to apply to the real blocksizes to
// come up with the induced complex blocksizes (for default
// and maximum blocksizes).
bszid_t bs_id = va_arg( args, bszid_t );
blksz_t* blksz = va_arg( args, blksz_t* );
bszid_t bm_id = va_arg( args, bszid_t );
double dsclr = va_arg( args, double );
double msclr = va_arg( args, double );
// Store the values in our temporary arrays.
bszids[ i ] = bs_id;
blkszs[ i ] = blksz;
bmults[ i ] = bm_id;
scalrs[ i ] = scalr;
dsclrs[ i ] = dsclr;
msclrs[ i ] = msclr;
}
}
@@ -510,12 +525,12 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
{
// Read the current blocksize id, blksz_t* pointer, blocksize
// multiple id, and blocksize scalar.
const bszid_t bs_id = bszids[ i ];
const bszid_t bm_id = bmults[ i ];
bszid_t bs_id = bszids[ i ];
bszid_t bm_id = bmults[ i ];
blksz_t* blksz = blkszs[ i ];
blksz_t* blksz = blkszs[ i ];
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
// Copy the blksz_t object contents into the appropriate
// location within the context's blksz_t array. Do the same
@@ -534,14 +549,15 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
{
// Read the current blocksize id, blksz_t pointer, blocksize
// multiple id, and blocksize scalar.
const bszid_t bs_id = bszids[ i ];
const bszid_t bm_id = bmults[ i ];
const dim_t scalr = scalrs[ i ];
bszid_t bs_id = bszids[ i ];
bszid_t bm_id = bmults[ i ];
double dsclr = dsclrs[ i ];
double msclr = msclrs[ i ];
blksz_t* blksz = blkszs[ i ];
blksz_t* bmult = blkszs[ i ];
blksz_t* blksz = blkszs[ i ];
blksz_t* bmult = blkszs[ i ];
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
// Copy the real domain values of the source blksz_t object into
// the context, duplicating into the complex domain fields.
@@ -550,20 +566,50 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DCOMPLEX, cntx_blksz );
// The next steps apply only to cache blocksizes, and not register
// blocksizes (ie: they only apply to blocksizes for which the
// blocksize multiple id is different than the blocksize id) and
// only when the scalar provided is non-unit.
if ( bs_id != bm_id && scalr != 1 )
// If the default blocksize scalar is non-unit, we need to scale
// the complex domain default blocksizes.
if ( dsclr != 1.0 )
{
// Scale the complex domain values in the blocksize object.
bli_blksz_scale_dt_by( 1, scalr, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_scale_dt_by( 1, scalr, BLIS_DCOMPLEX, cntx_blksz );
// Scale the complex domain default blocksize values in the
// blocksize object.
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz );
// Finally, round the newly-scaled blocksizes down to their
// respective multiples.
bli_blksz_reduce_dt_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_reduce_dt_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz );
if ( bs_id != bm_id )
{
// Round the newly-scaled blocksizes down to their multiple.
// (Note that both the default and maximum blocksize values
// must be a multiple of the same blocksize multiple.) Also,
// note that this is only done when the blocksize id is not
// equal to the blocksize multiple id (ie: we don't round
// down scaled register blocksizes since they are their own
// multiples).
bli_blksz_reduce_def_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_reduce_def_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz );
}
}
// Similarly, if the maximum blocksize scalar is non-unit, we need
// to scale the complex domain maximum blocksizes.
if ( msclr != 1.0 )
{
// Scale the complex domain maximum blocksize values in the
// blocksize object.
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz );
if ( bs_id != bm_id )
{
// Round the newly-scaled blocksizes down to their multiple.
// (Note that both the default and maximum blocksize values
// must be a multiple of the same blocksize multiple.) Also,
// note that this is only done when the blocksize id is not
// equal to the blocksize multiple id (ie: we don't round
// down scaled register blocksizes since they are their own
// multiples).
bli_blksz_reduce_max_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_reduce_max_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz );
}
}
// Copy the blocksize multiple id into the context.
@@ -575,7 +621,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
bli_free_intl( blkszs );
bli_free_intl( bszids );
bli_free_intl( bmults );
bli_free_intl( scalrs );
bli_free_intl( dsclrs );
bli_free_intl( msclrs );
}
#endif
@@ -668,31 +715,39 @@ void bli_cntx_set_ind_method( ind_t method,
bli_cntx_set_method( method, cntx );
}
void bli_cntx_set_pack_schema_ab( pack_t schema_a,
pack_t schema_b,
cntx_t* cntx )
void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a,
pack_t schema_b,
cntx_t* cntx )
{
bli_cntx_set_schema_a( schema_a, cntx );
bli_cntx_set_schema_b( schema_b, cntx );
bli_cntx_set_schema_a_block( schema_a, cntx );
bli_cntx_set_schema_b_panel( schema_b, cntx );
}
void bli_cntx_set_pack_schema_a( pack_t schema_a,
cntx_t* cntx )
void bli_cntx_set_pack_schema_a_block( pack_t schema_a,
cntx_t* cntx )
{
bli_cntx_set_schema_a( schema_a, cntx );
bli_cntx_set_schema_a_block( schema_a, cntx );
}
void bli_cntx_set_pack_schema_b( pack_t schema_b,
cntx_t* cntx )
void bli_cntx_set_pack_schema_b_panel( pack_t schema_b,
cntx_t* cntx )
{
bli_cntx_set_schema_b( schema_b, cntx );
bli_cntx_set_schema_b_panel( schema_b, cntx );
}
void bli_cntx_set_pack_schema_c( pack_t schema_c,
void bli_cntx_set_pack_schema_c_panel( pack_t schema_c,
cntx_t* cntx )
{
bli_cntx_set_schema_c_panel( schema_c, cntx );
}
#if 0
void bli_cntx_set_ukr_anti_pref( bool_t anti_pref,
cntx_t* cntx )
{
bli_cntx_set_schema_c( schema_c, cntx );
bli_cntx_set_anti_pref( anti_pref, cntx );
}
#endif
void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx,
dim_t m, dim_t n, dim_t k )
@@ -729,12 +784,20 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx,
}
}
jc = bli_env_read_nway( "BLIS_JC_NT", jc );
//pc = bli_env_read_nway( "BLIS_KC_NT", 1 );
pc = 1;
ic = bli_env_read_nway( "BLIS_IC_NT", ic );
jr = bli_env_read_nway( "BLIS_JR_NT", jr );
ir = bli_env_read_nway( "BLIS_IR_NT", ir );
pc = 1;
dim_t jc_env = bli_env_read_nway( "BLIS_JC_NT", -1 );
dim_t ic_env = bli_env_read_nway( "BLIS_IC_NT", -1 );
dim_t jr_env = bli_env_read_nway( "BLIS_JR_NT", -1 );
dim_t ir_env = bli_env_read_nway( "BLIS_IR_NT", -1 );
if (jc_env != -1 || ic_env != -1 || jr_env != -1 || ir_env != -1)
{
jc = (jc_env == -1 ? 1 : jc_env);
ic = (ic_env == -1 ? 1 : ic_env);
jr = (jr_env == -1 ? 1 : jr_env);
ir = (ir_env == -1 ? 1 : ir_env);
}
#else
@@ -867,6 +930,32 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj,
return r_val;
}
bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t*  obj,
                                                   l3ukr_t ukr_id,
                                                   cntx_t* cntx )
{
	// Begin with the answer given by the plain (non-"eff") query.
	const bool_t prefers = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx );

	// The "effective" answer is the logical negation of the plain one
	// whenever the context's anti-preference flag is set.
	return ( bli_cntx_anti_pref( cntx ) ? !prefers : prefers );
}
bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t*  obj,
                                                    l3ukr_t ukr_id,
                                                    cntx_t* cntx )
{
	// Begin with the answer given by the plain (non-"eff") query.
	const bool_t dislikes = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx );

	// The "effective" answer is the logical negation of the plain one
	// whenever the context's anti-preference flag is set.
	return ( bli_cntx_anti_pref( cntx ) ? !dislikes : dislikes );
}
// -----------------------------------------------------------------------------
bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt,
l3ukr_t ukr_id,
cntx_t* cntx )
@@ -916,6 +1005,30 @@ bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj,
return r_val;
}
bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t*  obj,
                                               l3ukr_t ukr_id,
                                               cntx_t* cntx )
{
	// Begin with the answer given by the plain (non-"eff") query.
	const bool_t prefers = bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx );

	// The "effective" answer is the logical negation of the plain one
	// whenever the context's anti-preference flag is set.
	return ( bli_cntx_anti_pref( cntx ) ? !prefers : prefers );
}
bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t*  obj,
                                                l3ukr_t ukr_id,
                                                cntx_t* cntx )
{
	// Begin with the answer given by the plain (non-"eff") query.
	const bool_t dislikes = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx );

	// The "effective" answer is the logical negation of the plain one
	// whenever the context's anti-preference flag is set.
	return ( bli_cntx_anti_pref( cntx ) ? !dislikes : dislikes );
}
// -----------------------------------------------------------------------------
void bli_cntx_print( cntx_t* cntx )

View File

@@ -59,6 +59,8 @@ typedef struct cntx_s
pack_t schema_b;
pack_t schema_c;
bool_t anti_pref;
dim_t* thrloop;
membrk_t* membrk;
@@ -113,26 +115,30 @@ typedef struct cntx_s
\
( (cntx)->method )
#define bli_cntx_schema_a( cntx ) \
#define bli_cntx_schema_a_block( cntx ) \
\
( (cntx)->schema_a )
( (cntx)->schema_a_block )
#define bli_cntx_schema_b( cntx ) \
#define bli_cntx_schema_b_panel( cntx ) \
\
( (cntx)->schema_b )
( (cntx)->schema_b_panel )
#define bli_cntx_schema_c( cntx ) \
#define bli_cntx_schema_c_panel( cntx ) \
\
( (cntx)->schema_c )
( (cntx)->schema_c_panel )
#define bli_cntx_membrk( cntx ) \
#define bli_cntx_anti_pref( cntx ) \
\
( (cntx)->membrk )
( (cntx)->anti_pref )
#define bli_cntx_thrloop( cntx ) \
\
( (cntx)->thrloop )
#define bli_cntx_membrk( cntx ) \
\
( (cntx)->membrk )
#if 1
#define bli_cntx_jc_way( cntx ) \
\
@@ -211,24 +217,24 @@ typedef struct cntx_s
(cntx_p)->method = _method; \
}
#define bli_cntx_set_schema_a( _schema_a, cntx_p ) \
#define bli_cntx_set_schema_a_block( _schema_a_block, cntx_p ) \
{ \
(cntx_p)->schema_a = _schema_a; \
(cntx_p)->schema_a_block = _schema_a_block; \
}
#define bli_cntx_set_schema_b( _schema_b, cntx_p ) \
#define bli_cntx_set_schema_b_panel( _schema_b_panel, cntx_p ) \
{ \
(cntx_p)->schema_b = _schema_b; \
(cntx_p)->schema_b_panel = _schema_b_panel; \
}
#define bli_cntx_set_schema_c( _schema_c, cntx_p ) \
#define bli_cntx_set_schema_c_panel( _schema_c_panel, cntx_p ) \
{ \
(cntx_p)->schema_c = _schema_c; \
(cntx_p)->schema_c_panel = _schema_c_panel; \
}
#define bli_cntx_set_membrk( _membrk, cntx_p ) \
#define bli_cntx_set_anti_pref( _anti_pref, cntx_p ) \
{ \
(cntx_p)->membrk = _membrk; \
(cntx_p)->anti_pref = _anti_pref; \
}
#define bli_cntx_set_thrloop( jc_, pc_, ic_, jr_, ir_, cntx_p ) \
@@ -241,6 +247,11 @@ typedef struct cntx_s
(cntx_p)->thrloop[ BLIS_KR ] = 1; \
}
#define bli_cntx_set_membrk( _membrk, cntx_p ) \
{ \
(cntx_p)->membrk = _membrk; \
}
// cntx_t query (complex)
#define bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ) \
@@ -323,13 +334,17 @@ typedef struct cntx_s
\
bli_cntx_method( cntx )
#define bli_cntx_get_pack_schema_a( cntx ) \
#define bli_cntx_get_pack_schema_a_block( cntx ) \
\
bli_cntx_schema_a( cntx )
bli_cntx_schema_a_block( cntx )
#define bli_cntx_get_pack_schema_b( cntx ) \
#define bli_cntx_get_pack_schema_b_panel( cntx ) \
\
bli_cntx_schema_b( cntx )
bli_cntx_schema_b_panel( cntx )
#define bli_cntx_get_pack_schema_c_panel( cntx ) \
\
bli_cntx_schema_c_panel( cntx )
#define bli_cntx_get_membrk( cntx ) \
\
@@ -395,9 +410,10 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx );
// l1vkr_t ker_id,
// cntx_t* cntx );
//ind_t bli_cntx_get_ind_method( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_c( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx );
//bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx );
dim_t bli_cntx_get_num_threads( cntx_t* cntx );
dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl );
@@ -425,15 +441,17 @@ void bli_cntx_set_packm_ukr( func_t* func,
cntx_t* cntx );
void bli_cntx_set_ind_method( ind_t method,
cntx_t* cntx );
void bli_cntx_set_pack_schema_ab( pack_t schema_a,
pack_t schema_b,
cntx_t* cntx );
void bli_cntx_set_pack_schema_a( pack_t schema_a,
cntx_t* cntx );
void bli_cntx_set_pack_schema_b( pack_t schema_b,
cntx_t* cntx );
void bli_cntx_set_pack_schema_c( pack_t schema_c,
cntx_t* cntx );
void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a,
pack_t schema_b,
cntx_t* cntx );
void bli_cntx_set_pack_schema_a_block( pack_t schema_a,
cntx_t* cntx );
void bli_cntx_set_pack_schema_b_panel( pack_t schema_b,
cntx_t* cntx );
void bli_cntx_set_pack_schema_c_panel( pack_t schema_c,
cntx_t* cntx );
//void bli_cntx_set_ukr_anti_pref( bool_t anti_pref,
// cntx_t* cntx );
void bli_cntx_set_thrloop_from_env( opid_t l3_op,
side_t side,
cntx_t* cntx,
@@ -455,6 +473,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj,
bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt,
l3ukr_t ukr_id,
cntx_t* cntx );
@@ -467,6 +491,12 @@ bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj,
bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx );
// print function
@@ -488,13 +518,13 @@ void bli_cntx_print( cntx_t* cntx );
// pointer is NULL. When initializing, the context address that should
// be used (local or external) is assigned to cntx_p.
#define bli_cntx_init_local_if( opname, cntx, cntx_p ) \
#define bli_cntx_init_local_if( opname, dt, cntx, cntx_p ) \
\
cntx_t _cntx_l; \
\
if ( bli_is_null( cntx ) ) \
{ \
PASTEMAC(opname,_cntx_init)( &_cntx_l ); \
PASTEMAC(opname,_cntx_init)( dt, &_cntx_l ); \
cntx_p = &_cntx_l; \
} \
else \
@@ -510,13 +540,13 @@ void bli_cntx_print( cntx_t* cntx );
}
#define bli_cntx_init_local_if2( opname, suf, cntx, cntx_p ) \
#define bli_cntx_init_local_if2( opname, suf, dt, cntx, cntx_p ) \
\
cntx_t _cntx_l; \
\
if ( bli_is_null( cntx ) ) \
{ \
PASTEMAC2(opname,suf,_cntx_init)( &_cntx_l ); \
PASTEMAC2(opname,suf,_cntx_init)( dt, &_cntx_l ); \
cntx_p = &_cntx_l; \
} \
else \

View File

@@ -94,48 +94,47 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
{
/* Example prototypes:
void
bli_gks_cntx_set_blkszs(
void bli_gks_cntx_set_blkszs
(
ind_t method = BLIS_NAT,
dim_t n_bs,
bszid_t bs0_id, bszid_t bm0_id,
bszid_t bs1_id, bszid_t bm1_id,
bszid_t bs2_id, bszid_t bm2_id,
...
cntx_t* cntx
);
ind_t method = BLIS_NAT,
dim_t n_bs,
bszid_t bs0_id, bszid_t bm0_id,
bszid_t bs1_id, bszid_t bm1_id,
bszid_t bs2_id, bszid_t bm2_id,
...
cntx_t* cntx );
void
bli_gks_cntx_set_blkszs(
ind_t method != BLIS_NAT,
dim_t n_bs,
bszid_t bs0_id, bszid_t bm0_id, dim_t scalr0,
bszid_t bs1_id, bszid_t bm1_id, dim_t scalr1,
bszid_t bs2_id, bszid_t bm2_id, dim_t scalr2,
...
cntx_t* cntx );
void bli_gks_cntx_set_blkszs
(
ind_t method != BLIS_NAT,
dim_t n_bs,
bszid_t bs0_id, bszid_t bm0_id, double def_scalr0, double max_scalr0,
bszid_t bs1_id, bszid_t bm1_id, double def_scalr1, double max_scalr1,
bszid_t bs2_id, bszid_t bm2_id, double def_scalr2, double max_scalr2,
...
cntx_t* cntx
);
*/
va_list args;
dim_t i;
bszid_t* bszids;
bszid_t* bmults;
double* scalrs;
double* dsclrs;
double* msclrs;
cntx_t* cntx;
blksz_t* cntx_blkszs;
bszid_t* cntx_bmults;
bszid_t bs_id;
bszid_t bm_id;
double scalr;
// Allocate some temporary local arrays.
bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) );
bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ) );
scalrs = bli_malloc_intl( n_bs * sizeof( double ) );
dsclrs = bli_malloc_intl( n_bs * sizeof( double ) );
msclrs = bli_malloc_intl( n_bs * sizeof( double ) );
// -- Begin variable argument section --
@@ -152,8 +151,8 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
// - the bszid_t of the blocksize we're about to process,
// - the bszid_t of the multiple we need to associate with
// the blksz_t object.
bs_id = va_arg( args, bszid_t );
bm_id = va_arg( args, bszid_t );
bszid_t bs_id = va_arg( args, bszid_t );
bszid_t bm_id = va_arg( args, bszid_t );
// Store the values in our temporary arrays.
bszids[ i ] = bs_id;
@@ -169,16 +168,19 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
// - the bszid_t of the blocksize we're about to process,
// - the bszid_t of the multiple we need to associate with
// the blksz_t object.
// - the scalar we wish to apply to the real blocksizes to
// come up with the induced complex blocksizes.
bs_id = va_arg( args, bszid_t );
bm_id = va_arg( args, bszid_t );
scalr = va_arg( args, double );
// - the scalars we wish to apply to the real blocksizes to
// come up with the induced complex blocksizes (for default
// and maximum blocksizes).
bszid_t bs_id = va_arg( args, bszid_t );
bszid_t bm_id = va_arg( args, bszid_t );
double dsclr = va_arg( args, double );
double msclr = va_arg( args, double );
// Store the values in our temporary arrays.
bszids[ i ] = bs_id;
bmults[ i ] = bm_id;
scalrs[ i ] = scalr;
dsclrs[ i ] = dsclr;
msclrs[ i ] = msclr;
}
}
@@ -210,10 +212,10 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
for ( i = 0; i < n_bs; ++i )
{
// Read the current blocksize id, blocksize multiple id.
bszid_t bs_id = bszids[ i ];
bszid_t bm_id = bmults[ i ];
bszid_t bs_id = bszids[ i ];
bszid_t bm_id = bmults[ i ];
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
// Query the blocksizes (blksz_t) associated with bs_id and save
// them directly into the appropriate location in the context's
@@ -231,41 +233,75 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
{
// Read the current blocksize id, blocksize multiple id,
// and blocksize scalar.
bszid_t bs_id = bszids[ i ];
bszid_t bm_id = bmults[ i ];
double scalr = scalrs[ i ];
bszid_t bs_id = bszids[ i ];
bszid_t bm_id = bmults[ i ];
double dsclr = dsclrs[ i ];
double msclr = msclrs[ i ];
blksz_t blksz;
blksz_t bmult;
blksz_t blksz_l;
blksz_t bmult_l;
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
blksz_t* blksz = &blksz_l;
blksz_t* bmult = &bmult_l;
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
// Query the blocksizes (blksz_t) associated with bs_id and bm_id
// and use them to populate a pair of local blksz_t objects.
bli_gks_get_blksz( bs_id, &blksz );
bli_gks_get_blksz( bm_id, &bmult );
bli_gks_get_blksz( bs_id, blksz );
bli_gks_get_blksz( bm_id, bmult );
// Copy the real domain values of the source blksz_t object into
// the context, duplicating into the complex domain fields.
bli_blksz_copy_dt( BLIS_FLOAT, &blksz, BLIS_FLOAT, cntx_blksz );
bli_blksz_copy_dt( BLIS_DOUBLE, &blksz, BLIS_DOUBLE, cntx_blksz );
bli_blksz_copy_dt( BLIS_FLOAT, &blksz, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_copy_dt( BLIS_DOUBLE, &blksz, BLIS_DCOMPLEX, cntx_blksz );
bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_FLOAT, cntx_blksz );
bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DOUBLE, cntx_blksz );
bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DCOMPLEX, cntx_blksz );
// The next steps apply only to cache blocksizes, and not register
// blocksizes (ie: they only apply to blocksizes for which the
// blocksize multiple id is different than the blocksize id) and
// only when the scalar provided is non-unit.
if ( bs_id != bm_id && scalr != 1.0 )
// If the default blocksize scalar is non-unit, we need to scale
// the complex domain default blocksizes.
if ( dsclr != 1.0 )
{
// Scale the complex domain values in the blocksize object.
bli_blksz_scale_dt_by( 1, (dim_t)scalr, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_scale_dt_by( 1, (dim_t)scalr, BLIS_DCOMPLEX, cntx_blksz );
// Scale the complex domain default blocksize values in the
// blocksize object.
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz );
// Finally, round the newly-scaled blocksizes down to their
// respective multiples.
bli_blksz_reduce_dt_to( BLIS_FLOAT, &bmult, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_reduce_dt_to( BLIS_DOUBLE, &bmult, BLIS_DCOMPLEX, cntx_blksz );
if ( bs_id != bm_id )
{
// Round the newly-scaled blocksizes down to their multiple.
// (Note that both the default and maximum blocksize values
// must be a multiple of the same blocksize multiple.) Also,
// note that this is only done when the blocksize id is not
// equal to the blocksize multiple id (ie: we don't round
// down scaled register blocksizes since they are their own
// multiples).
bli_blksz_reduce_def_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_reduce_def_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz );
}
}
// Similarly, if the maximum blocksize scalar is non-unit, we need
// to scale the complex domain maximum blocksizes.
if ( msclr != 1.0 )
{
// Scale the complex domain maximum blocksize values in the
// blocksize object.
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz );
if ( bs_id != bm_id )
{
// Round the newly-scaled blocksizes down to their multiple.
// (Note that both the default and maximum blocksize values
// must be a multiple of the same blocksize multiple.) Also,
// note that this is only done when the blocksize id is not
// equal to the blocksize multiple id (ie: we don't round
// down scaled register blocksizes since they are their own
// multiples).
bli_blksz_reduce_max_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_reduce_max_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz );
}
}
// Copy the blocksize multiple id into the context.
@@ -276,7 +312,8 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
// Free the temporary local arrays.
bli_free_intl( bszids );
bli_free_intl( bmults );
bli_free_intl( scalrs );
bli_free_intl( dsclrs );
bli_free_intl( msclrs );
}
@@ -337,6 +374,18 @@ static func_t bli_gks_l3_ind_ukrs[BLIS_NUM_IND_METHODS]
/* trsm_l */ { { NULL, BLIS_CTRSM4M1_L_UKERNEL, NULL, BLIS_ZTRSM4M1_L_UKERNEL, } },
/* trsm_u */ { { NULL, BLIS_CTRSM4M1_U_UKERNEL, NULL, BLIS_ZTRSM4M1_U_UKERNEL, } },
},
/* 1m */ {
/* gemm */ { { BLIS_SGEMM_UKERNEL, BLIS_CGEMM1M_UKERNEL,
BLIS_DGEMM_UKERNEL, BLIS_ZGEMM1M_UKERNEL, } },
/* gemmtrsm_l */ { { NULL, BLIS_CGEMMTRSM1M_L_UKERNEL,
NULL, BLIS_ZGEMMTRSM1M_L_UKERNEL, } },
/* gemmtrsm_u */ { { NULL, BLIS_CGEMMTRSM1M_U_UKERNEL,
NULL, BLIS_ZGEMMTRSM1M_U_UKERNEL, } },
/* trsm_l */ { { NULL, BLIS_CTRSM1M_L_UKERNEL,
NULL, BLIS_ZTRSM1M_L_UKERNEL, } },
/* trsm_u */ { { NULL, BLIS_CTRSM1M_U_UKERNEL,
NULL, BLIS_ZTRSM1M_U_UKERNEL, } },
},
/* nat */ {
/* gemm */ { { BLIS_SGEMM_UKERNEL, BLIS_CGEMM_UKERNEL,
BLIS_DGEMM_UKERNEL, BLIS_ZGEMM_UKERNEL, } },
@@ -557,6 +606,9 @@ void bli_gks_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr,
mbool_t* cntx_l3_nat_ukr_pref = &cntx_l3_nat_ukr_prefs[ ukr ];
bli_gks_get_l3_nat_ukr_prefs( ukr, cntx_l3_nat_ukr_pref );
// Explicitly set the anti-preference to FALSE.
bli_cntx_set_anti_pref( FALSE, cntx );
}
@@ -565,6 +617,8 @@ void bli_gks_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr,
// -- packm structure-aware kernel structure -----------------------------------
//
// IF ENABLED: NEEDS UPDATING FOR 1M.
static func_t bli_gks_packm_struc_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
{
/* float (0) scomplex (1) double (2) dcomplex (3) */

View File

@@ -61,8 +61,10 @@ void bli_memsys_init( void )
if ( bli_memsys_is_init == TRUE ) return;
// Create and initialize a context for gemm so we have something
// to pass into bli_membrk_init_pools().
bli_gemm_cntx_init( &cntx );
// to pass into bli_membrk_init_pools(). We use BLIS_DOUBLE for
// the datatype, but the dt argument is actually only used when
// initializing contexts for induced methods.
bli_gemm_cntx_init( BLIS_DOUBLE, &cntx );
#ifdef BLIS_ENABLE_OPENMP
_Pragma( "omp critical (mem)" )

View File

@@ -877,6 +877,12 @@ bli_obj_width_stored( obj )
(obj).n_panel = n0; \
}
// Set both panel dimensions of obj in one step: m0 is forwarded to
// bli_obj_set_panel_length() and n0 to bli_obj_set_panel_width(), in
// that order.
#define bli_obj_set_panel_dims( m0, n0, obj ) \
{ \
bli_obj_set_panel_length( m0, obj ); \
bli_obj_set_panel_width( n0, obj ); \
}
#define bli_obj_set_panel_dim( panel_dim, obj ) \
{ \
(obj).pd = panel_dim; \
@@ -985,6 +991,7 @@ bli_obj_width_stored( obj )
#define bli_obj_induce_trans( obj ) \
{ \
{ \
/* Induce transposition among basic fields. */ \
dim_t m_ = bli_obj_length( obj ); \
dim_t n_ = bli_obj_width( obj ); \
inc_t rs_ = bli_obj_row_stride( obj ); \
@@ -1000,6 +1007,15 @@ bli_obj_width_stored( obj )
\
if ( bli_obj_is_upper_or_lower( obj ) ) \
bli_obj_toggle_uplo( obj ); \
\
/* Induce transposition among packed fields. */ \
dim_t m_padded_ = bli_obj_padded_length( obj ); \
dim_t n_padded_ = bli_obj_padded_width( obj ); \
dim_t m_panel_ = bli_obj_panel_length( obj ); \
dim_t n_panel_ = bli_obj_panel_width( obj ); \
\
bli_obj_set_padded_dims( n_padded_, m_padded_, obj ); \
bli_obj_set_panel_dims( n_panel_, m_panel_, obj ); \
\
/* Note that this macro DOES NOT touch the transposition bit! If
the calling code is using this macro to handle an object whose

View File

@@ -654,6 +654,19 @@
bli_is_io_packed( schema ) || \
bli_is_rpi_packed( schema ) )
// Pack-schema predicates for the 1m formats. Each one masks the schema
// with BLIS_PACK_FORMAT_BITS and compares the result against the format's
// bit value. The schema argument is parenthesized so that a compound
// argument expression (e.g. "a | b") is masked as a whole rather than
// being split by operator precedence ('&' binds tighter than '|').

// Evaluates to non-zero if the format bits of schema equal BLIS_BITVAL_1R.
#define bli_is_1r_packed( schema ) \
\
	( ( (schema) & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R )

// Evaluates to non-zero if the format bits of schema equal BLIS_BITVAL_1E.
#define bli_is_1e_packed( schema ) \
\
	( ( (schema) & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E )

// Evaluates to non-zero if schema is either of the two 1m formats (1r or
// 1e). Note that the schema argument may be evaluated twice.
#define bli_is_1m_packed( schema ) \
\
	( bli_is_1r_packed( schema ) || \
	  bli_is_1e_packed( schema ) )
// Evaluates to non-zero if none of the BLIS_PACK_FORMAT_BITS are set in
// schema. The schema argument is parenthesized so that a compound
// argument expression (e.g. "a | b") is masked as a whole rather than
// being split by operator precedence ('&' binds tighter than '|').
#define bli_is_nat_packed( schema ) \
\
	( ( (schema) & BLIS_PACK_FORMAT_BITS ) == 0 )

View File

@@ -225,6 +225,43 @@
#include "bli_scal2jrpis.h"
// -- 1m-specific scalar macros --
#include "bli_invert1ms_mxn_diag.h"
#include "bli_scal1ms_mxn.h"
#include "bli_scal21ms_mxn_diag.h"
#include "bli_scal21ms_mxn_uplo.h"
#include "bli_set1ms_mxn.h"
#include "bli_set1ms_mxn_diag.h"
#include "bli_set1ms_mxn_uplo.h"
#include "bli_seti01ms_mxn_diag.h"
// 1e
#include "bli_copy1es.h"
#include "bli_copyj1es.h"
#include "bli_invert1es.h"
#include "bli_scal1es.h"
#include "bli_scal21es.h"
#include "bli_scal2j1es.h"
// 1r
#include "bli_copy1rs.h"
#include "bli_copyj1rs.h"
#include "bli_invert1rs.h"
#include "bli_scal1rs.h"
#include "bli_scal21rs.h"
#include "bli_scal2j1rs.h"
// -- Miscellaneous macros --

View File

@@ -224,6 +224,10 @@ typedef dcomplex f77_dcomplex;
- 1 0110 11: packed imag-only column panels
- 1 0111 10: packed real+imag row panels
- 1 0111 11: packed real+imag column panels
- 1 1000 10: packed by 1m expanded row panels
- 1 1000 11: packed by 1m expanded column panels
- 1 1001 10: packed by 1m reordered row panels
- 1 1001 11: packed by 1m reordered column panels
23 Packed panel order if upper-stored
- 0 == forward order if upper
- 1 == reverse order if upper
@@ -329,6 +333,8 @@ typedef dcomplex f77_dcomplex;
#define BLIS_BITVAL_RO ( 0x5 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_IO ( 0x6 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_RPI ( 0x7 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_1E ( 0x8 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_1R ( 0x9 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT )
#define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT )
#define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT )
@@ -348,6 +354,10 @@ typedef dcomplex f77_dcomplex;
#define BLIS_BITVAL_PACKED_COL_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0
@@ -469,13 +479,17 @@ typedef enum
BLIS_PACKED_COL_PANELS_IO = BLIS_BITVAL_PACKED_COL_PANELS_IO,
BLIS_PACKED_ROW_PANELS_RPI = BLIS_BITVAL_PACKED_ROW_PANELS_RPI,
BLIS_PACKED_COL_PANELS_RPI = BLIS_BITVAL_PACKED_COL_PANELS_RPI,
BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E,
BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R,
} pack_t;
// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. We also count the
// schema pair for "4ms" (4m separated), because its bit value has
// been reserved, even though we don't use it.
#define BLIS_NUM_PACK_SCHEMA_TYPES 8
#define BLIS_NUM_PACK_SCHEMA_TYPES 10
// -- Pack order type --
@@ -575,6 +589,7 @@ typedef enum
BLIS_4MH,
BLIS_4M1B,
BLIS_4M1A,
BLIS_1M,
BLIS_NAT,
} ind_t;
@@ -960,9 +975,11 @@ typedef struct cntx_s
opid_t family;
ind_t method;
pack_t schema_a;
pack_t schema_b;
pack_t schema_c;
pack_t schema_a_block;
pack_t schema_b_panel;
pack_t schema_c_panel;
bool_t anti_pref;
dim_t thrloop[ BLIS_NUM_LOOPS ];

View File

@@ -0,0 +1,53 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_COPY1ES_H
#define BLIS_COPY1ES_H
// copy1es
//
// Store a into the two destination elements bri and bir:
//   bri receives the pair (  real(a), imag(a) )
//   bir receives the pair ( -imag(a), real(a) )
// i.e. bri holds a and bir holds i*a when read as (real, imag) pairs.
// NOTE(review): this reading assumes bli_?copyris( ar, ai, br, bi ) assigns
// br = ar and bi = ai; that macro is defined elsewhere -- confirm.

// Single-precision complex variant.
#define bli_ccopy1es( a, bri, bir ) \
{ \
bli_ccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
bli_ccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
// Double-precision complex variant.
#define bli_zcopy1es( a, bri, bir ) \
{ \
bli_zcopyris( bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
bli_zcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#endif

View File

@@ -0,0 +1,53 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_COPYJ1ES_H
#define BLIS_COPYJ1ES_H
// copyj1es
//
// Conjugating counterpart of copy1es. Store conj(a) into the two
// destination elements bri and bir:
//   bri receives the pair ( real(a), -imag(a) )
//   bir receives the pair ( imag(a),  real(a) )
// i.e. bri holds conj(a) and bir holds i*conj(a) when read as (real, imag)
// pairs.
// NOTE(review): this reading assumes bli_?copyris( ar, ai, br, bi ) assigns
// br = ar and bi = ai; that macro is defined elsewhere -- confirm.

// Single-precision complex variant.
#define bli_ccopyj1es( a, bri, bir ) \
{ \
bli_ccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
bli_ccopyris( bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
// Double-precision complex variant.
#define bli_zcopyj1es( a, bri, bir ) \
{ \
bli_zcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
bli_zcopyris( bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#endif

View File

@@ -0,0 +1,53 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_INVERT1ES_H
#define BLIS_INVERT1ES_H
// invert1es
//
// Update bri in place via bli_?invertris() (presumably the complex
// reciprocal -- defined elsewhere, confirm), then refresh bir from the new
// bri. Note that the destination arguments of the copy are passed in
// (imag, real) order, so the net effect is
//   imag(bir) = real(bri),  real(bir) = -imag(bri)
// which leaves bir holding ( -imag(bri), real(bri) ), the same bri/bir
// relationship that the copy1es macros establish.
// NOTE(review): this reading assumes bli_?copyris( ar, ai, br, bi ) assigns
// br = ar and bi = ai; that macro is defined elsewhere -- confirm.

// Single-precision complex variant.
#define bli_cinvert1es( bri, bir ) \
{ \
bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \
bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \
}
// Double-precision complex variant.
#define bli_zinvert1es( bri, bir ) \
{ \
bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \
bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \
}
#endif

View File

@@ -0,0 +1,53 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SCAL1ES_H
#define BLIS_SCAL1ES_H
// scal1es
/*
   bli_cscal1es( a, yri, yir )

   Two-step update of a pair of single-precision complex operands:
     1. bli_cscalris() receives the real/imaginary parts of a followed by
        the real/imaginary parts of yri.
     2. bli_ccopyris() receives ( -imag(yri), real(yri) ) followed by
        ( real(yir), imag(yir) ).

   NOTE(review): presumably step 1 scales yri by a in place and step 2
   copies its first pair into yir, leaving yir = ( -imag(yri), real(yri) )
   -- confirm against bli_cscalris() / bli_ccopyris().

   yri is expanded four times; arguments must be free of side effects.
*/
#define bli_cscal1es( a, yri, yir ) \
{ \
    /* Step 1: scaling call on the "ri" operand. */ \
    bli_cscalris( \
        bli_creal(a),   bli_cimag(a), \
        bli_creal(yri), bli_cimag(yri) \
    ); \
    /* Step 2: derive the "ir" operand from the updated "ri" operand. */ \
    bli_ccopyris( \
        -bli_cimag(yri), bli_creal(yri), \
         bli_creal(yir), bli_cimag(yir) \
    ); \
}
/*
   bli_zscal1es( a, yri, yir )

   Double-precision complex counterpart of the 1e "scal" update:
     1. bli_zscalris() receives the real/imaginary parts of a followed by
        the real/imaginary parts of yri.
     2. bli_zcopyris() receives ( -imag(yri), real(yri) ) followed by
        ( real(yir), imag(yir) ).

   NOTE(review): presumably step 1 scales yri by a in place and step 2
   copies its first pair into yir, leaving yir = ( -imag(yri), real(yri) )
   -- confirm against bli_zscalris() / bli_zcopyris().

   yri is expanded four times; arguments must be free of side effects.
*/
#define bli_zscal1es( a, yri, yir ) \
{ \
    /* Step 1: scaling call on the "ri" operand. */ \
    bli_zscalris( \
        bli_zreal(a),   bli_zimag(a), \
        bli_zreal(yri), bli_zimag(yri) \
    ); \
    /* Step 2: derive the "ir" operand from the updated "ri" operand. */ \
    bli_zcopyris( \
        -bli_zimag(yri), bli_zreal(yri), \
         bli_zreal(yir), bli_zimag(yir) \
    ); \
}
#endif

View File

@@ -0,0 +1,65 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SCAL21ES_H
#define BLIS_SCAL21ES_H
// scal21es
/*
   bli_cscal21es( a, x, yri, yir )

   Issues two bli_cscal2ris() calls that share the scalar
   ( real(a), imag(a) ):
     - the first passes ( real(x),  imag(x) ) and targets yri;
     - the second passes ( -imag(x), real(x) ) and targets yir.

   NOTE(review): presumably bli_cscal2ris() stores the product of its first
   two operands into its third, so yri = a*x and yir = a*(i*x) -- confirm
   against the definition of bli_cscal2ris().

   a and x are each expanded four times; arguments must be free of side
   effects.
*/
#define bli_cscal21es( a, x, yri, yir ) \
{ \
    /* "ri" result: x is passed through unchanged. */ \
    bli_cscal2ris( \
        bli_creal(a),   bli_cimag(a), \
        bli_creal(x),   bli_cimag(x), \
        bli_creal(yri), bli_cimag(yri) \
    ); \
    /* "ir" result: x is passed as ( -imag, real ). */ \
    bli_cscal2ris( \
         bli_creal(a),   bli_cimag(a), \
        -bli_cimag(x),   bli_creal(x), \
         bli_creal(yir), bli_cimag(yir) \
    ); \
}
/*
   bli_zscal21es( a, x, yri, yir )

   Double-precision complex variant. Issues two bli_zscal2ris() calls that
   share the scalar ( real(a), imag(a) ):
     - the first passes ( real(x),  imag(x) ) and targets yri;
     - the second passes ( -imag(x), real(x) ) and targets yir.

   NOTE(review): presumably bli_zscal2ris() stores the product of its first
   two operands into its third, so yri = a*x and yir = a*(i*x) -- confirm
   against the definition of bli_zscal2ris().

   a and x are each expanded four times; arguments must be free of side
   effects.
*/
#define bli_zscal21es( a, x, yri, yir ) \
{ \
    /* "ri" result: x is passed through unchanged. */ \
    bli_zscal2ris( \
        bli_zreal(a),   bli_zimag(a), \
        bli_zreal(x),   bli_zimag(x), \
        bli_zreal(yri), bli_zimag(yri) \
    ); \
    /* "ir" result: x is passed as ( -imag, real ). */ \
    bli_zscal2ris( \
         bli_zreal(a),   bli_zimag(a), \
        -bli_zimag(x),   bli_zreal(x), \
         bli_zreal(yir), bli_zimag(yir) \
    ); \
}
/*
   bli_scscal21es( a, x, yri, yir )

   Mixed variant: a is read through the s-prefixed accessors bli_sreal() /
   bli_simag(), while x, yri and yir are read through the c-prefixed ones.
   Issues two bli_scscal2ris() calls that share ( real(a), imag(a) ):
     - the first passes ( real(x),  imag(x) ) and targets yri;
     - the second passes ( -imag(x), real(x) ) and targets yir.

   NOTE(review): presumably bli_scscal2ris() stores the product of its
   first two operands into its third -- confirm against its definition.

   a and x are each expanded four times; arguments must be free of side
   effects.
*/
#define bli_scscal21es( a, x, yri, yir ) \
{ \
    /* "ri" result: x is passed through unchanged. */ \
    bli_scscal2ris( \
        bli_sreal(a),   bli_simag(a), \
        bli_creal(x),   bli_cimag(x), \
        bli_creal(yri), bli_cimag(yri) \
    ); \
    /* "ir" result: x is passed as ( -imag, real ). */ \
    bli_scscal2ris( \
         bli_sreal(a),   bli_simag(a), \
        -bli_cimag(x),   bli_creal(x), \
         bli_creal(yir), bli_cimag(yir) \
    ); \
}
/*
   bli_dzscal21es( a, x, yri, yir )

   Mixed variant: a is read through the d-prefixed accessors bli_dreal() /
   bli_dimag(), while x, yri and yir are read through the z-prefixed ones.
   Issues two bli_dzscal2ris() calls that share ( real(a), imag(a) ):
     - the first passes ( real(x),  imag(x) ) and targets yri;
     - the second passes ( -imag(x), real(x) ) and targets yir.

   NOTE(review): presumably bli_dzscal2ris() stores the product of its
   first two operands into its third -- confirm against its definition.

   a and x are each expanded four times; arguments must be free of side
   effects.
*/
#define bli_dzscal21es( a, x, yri, yir ) \
{ \
    /* "ri" result: x is passed through unchanged. */ \
    bli_dzscal2ris( \
        bli_dreal(a),   bli_dimag(a), \
        bli_zreal(x),   bli_zimag(x), \
        bli_zreal(yri), bli_zimag(yri) \
    ); \
    /* "ir" result: x is passed as ( -imag, real ). */ \
    bli_dzscal2ris( \
         bli_dreal(a),   bli_dimag(a), \
        -bli_zimag(x),   bli_zreal(x), \
         bli_zreal(yir), bli_zimag(yir) \
    ); \
}
#endif

View File

@@ -0,0 +1,65 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SCAL2J1ES_H
#define BLIS_SCAL2J1ES_H
// scal2j1es
/*
   bli_cscal2j1es( a, x, yri, yir )

   Conjugating counterpart of the 1e "scal2" update. Issues two
   bli_cscal2ris() calls that share the scalar ( real(a), imag(a) ):
     - the first passes ( real(x), -imag(x) ), i.e. the conjugate of x, and
       targets yri;
     - the second passes ( imag(x), real(x) ) and targets yir.

   NOTE(review): presumably bli_cscal2ris() stores the product of its first
   two operands into its third, so yri = a*conj(x) and yir = a*(i*conj(x))
   -- confirm against the definition of bli_cscal2ris().

   a and x are each expanded four times; arguments must be free of side
   effects.
*/
#define bli_cscal2j1es( a, x, yri, yir ) \
{ \
    /* "ri" result: x is passed with its imaginary part negated. */ \
    bli_cscal2ris( \
        bli_creal(a),    bli_cimag(a), \
        bli_creal(x),   -bli_cimag(x), \
        bli_creal(yri),  bli_cimag(yri) \
    ); \
    /* "ir" result: x is passed as ( imag, real ). */ \
    bli_cscal2ris( \
        bli_creal(a),   bli_cimag(a), \
        bli_cimag(x),   bli_creal(x), \
        bli_creal(yir), bli_cimag(yir) \
    ); \
}
/*
   bli_zscal2j1es( a, x, yri, yir )

   Double-precision conjugating variant. Issues two bli_zscal2ris() calls
   that share the scalar ( real(a), imag(a) ):
     - the first passes ( real(x), -imag(x) ), i.e. the conjugate of x, and
       targets yri;
     - the second passes ( imag(x), real(x) ) and targets yir.

   NOTE(review): presumably bli_zscal2ris() stores the product of its first
   two operands into its third, so yri = a*conj(x) and yir = a*(i*conj(x))
   -- confirm against the definition of bli_zscal2ris().

   a and x are each expanded four times; arguments must be free of side
   effects.
*/
#define bli_zscal2j1es( a, x, yri, yir ) \
{ \
    /* "ri" result: x is passed with its imaginary part negated. */ \
    bli_zscal2ris( \
        bli_zreal(a),    bli_zimag(a), \
        bli_zreal(x),   -bli_zimag(x), \
        bli_zreal(yri),  bli_zimag(yri) \
    ); \
    /* "ir" result: x is passed as ( imag, real ). */ \
    bli_zscal2ris( \
        bli_zreal(a),   bli_zimag(a), \
        bli_zimag(x),   bli_zreal(x), \
        bli_zreal(yir), bli_zimag(yir) \
    ); \
}
/*
   bli_scscal2j1es( a, x, yri, yir )

   Mixed conjugating variant: a is read through bli_sreal() / bli_simag(),
   while x, yri and yir are read through the c-prefixed accessors. Issues
   two bli_scscal2ris() calls that share ( real(a), imag(a) ):
     - the first passes ( real(x), -imag(x) ), i.e. the conjugate of x, and
       targets yri;
     - the second passes ( imag(x), real(x) ) and targets yir.

   NOTE(review): presumably bli_scscal2ris() stores the product of its
   first two operands into its third -- confirm against its definition.

   a and x are each expanded four times; arguments must be free of side
   effects.
*/
#define bli_scscal2j1es( a, x, yri, yir ) \
{ \
    /* "ri" result: x is passed with its imaginary part negated. */ \
    bli_scscal2ris( \
        bli_sreal(a),    bli_simag(a), \
        bli_creal(x),   -bli_cimag(x), \
        bli_creal(yri),  bli_cimag(yri) \
    ); \
    /* "ir" result: x is passed as ( imag, real ). */ \
    bli_scscal2ris( \
        bli_sreal(a),   bli_simag(a), \
        bli_cimag(x),   bli_creal(x), \
        bli_creal(yir), bli_cimag(yir) \
    ); \
}
/*
   bli_dzscal2j1es( a, x, yri, yir )

   Mixed conjugating variant: a is read through bli_dreal() / bli_dimag(),
   while x, yri and yir are read through the z-prefixed accessors. Issues
   two bli_dzscal2ris() calls that share ( real(a), imag(a) ):
     - the first passes ( real(x), -imag(x) ), i.e. the conjugate of x, and
       targets yri;
     - the second passes ( imag(x), real(x) ) and targets yir.

   NOTE(review): presumably bli_dzscal2ris() stores the product of its
   first two operands into its third -- confirm against its definition.

   a and x are each expanded four times; arguments must be free of side
   effects.
*/
#define bli_dzscal2j1es( a, x, yri, yir ) \
{ \
    /* "ri" result: x is passed with its imaginary part negated. */ \
    bli_dzscal2ris( \
        bli_dreal(a),    bli_dimag(a), \
        bli_zreal(x),   -bli_zimag(x), \
        bli_zreal(yri),  bli_zimag(yri) \
    ); \
    /* "ir" result: x is passed as ( imag, real ). */ \
    bli_dzscal2ris( \
        bli_dreal(a),   bli_dimag(a), \
        bli_zimag(x),   bli_zreal(x), \
        bli_zreal(yir), bli_zimag(yir) \
    ); \
}
#endif

View File

@@ -0,0 +1,126 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_INVERT1MS_MXN_DIAG_H
#define BLIS_INVERT1MS_MXN_DIAG_H
// invert1ms_mxn_diag
/*
   bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y )

   Applies the 1e or 1r "invert" element macro to min(m,n) consecutive
   diagonal elements of y, beginning at element (offm,offn).

   schema      Pack schema. If bli_is_1e_packed( schema ) is true the 1e
               branch runs; any other value takes the 1r branch.
   offm, offn  Row and column offsets of the first element visited.
   m, n        Only min(m,n) is used: the number of elements visited.
   y           Base pointer; used as scomplex* in the 1e branch and cast to
               float* in the 1r branch.
   rs_y, cs_y  Row/column strides of y in scomplex elements. The 1r branch
               doubles whichever stride is not 1 because it indexes floats.
   ld_y        1e: the "ir" element sits ld_y/2 scomplex elements after its
               "ri" element. 1r: the imaginary part sits ld_y floats after
               the real part.

   Macro hygiene: every parameter is parenthesized where it is used, so
   compound expressions (e.g. "p + off" for y) expand correctly. The body
   declares min_m_n, i, rs_y2, cs_y2, y_cast and y_off_*; argument
   expressions must not refer to caller variables with those names.
   Arguments are expanded more than once and must be free of side effects.
*/
#define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \
{ \
    dim_t min_m_n = bli_min( m, n ); \
    dim_t i; \
\
    /* Handle 1e and 1r separately. */ \
    if ( bli_is_1e_packed( schema ) ) \
    { \
        scomplex* restrict y_off_ri = (y) + (offm)*(rs_y) \
                                          + (offn)*(cs_y); \
        scomplex* restrict y_off_ir = (y) + (offm)*(rs_y) \
                                          + (offn)*(cs_y) + (ld_y)/2; \
\
        /* Walk the diagonal: row and column advance together. */ \
        for ( i = 0; i < min_m_n; ++i ) \
        { \
            bli_cinvert1es( *(y_off_ri + i*(rs_y) + i*(cs_y)), \
                            *(y_off_ir + i*(rs_y) + i*(cs_y)) ); \
        } \
    } \
    else /* if ( bli_is_1r_packed( schema ) ) */ \
    { \
        inc_t rs_y2 = (rs_y); \
        inc_t cs_y2 = (cs_y); \
\
        /* Scale the non-unit stride by two for the 1r loop, which steps */ \
        /* in units of real (not complex) values. */ \
        if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
        else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
        float* restrict y_cast  = ( float* )(y); \
        float* restrict y_off_r = y_cast + (offm)*rs_y2 \
                                         + (offn)*cs_y2; \
        float* restrict y_off_i = y_cast + (offm)*rs_y2 \
                                         + (offn)*cs_y2 + (ld_y); \
\
        for ( i = 0; i < min_m_n; ++i ) \
        { \
            bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \
                            *(y_off_i + i*rs_y2 + i*cs_y2) ); \
        } \
    } \
}
/*
   bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y )

   Double-precision variant. Applies the 1e or 1r "invert" element macro to
   min(m,n) consecutive diagonal elements of y, beginning at element
   (offm,offn).

   schema      Pack schema. If bli_is_1e_packed( schema ) is true the 1e
               branch runs; any other value takes the 1r branch.
   offm, offn  Row and column offsets of the first element visited.
   m, n        Only min(m,n) is used: the number of elements visited.
   y           Base pointer; used as dcomplex* in the 1e branch and cast to
               double* in the 1r branch.
   rs_y, cs_y  Row/column strides of y in dcomplex elements. The 1r branch
               doubles whichever stride is not 1 because it indexes doubles.
   ld_y        1e: the "ir" element sits ld_y/2 dcomplex elements after its
               "ri" element. 1r: the imaginary part sits ld_y doubles after
               the real part.

   Macro hygiene: every parameter is parenthesized where it is used, so
   compound expressions (e.g. "p + off" for y) expand correctly. The body
   declares min_m_n, i, rs_y2, cs_y2, y_cast and y_off_*; argument
   expressions must not refer to caller variables with those names.
   Arguments are expanded more than once and must be free of side effects.
*/
#define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \
{ \
    dim_t min_m_n = bli_min( m, n ); \
    dim_t i; \
\
    /* Handle 1e and 1r separately. */ \
    if ( bli_is_1e_packed( schema ) ) \
    { \
        dcomplex* restrict y_off_ri = (y) + (offm)*(rs_y) \
                                          + (offn)*(cs_y); \
        dcomplex* restrict y_off_ir = (y) + (offm)*(rs_y) \
                                          + (offn)*(cs_y) + (ld_y)/2; \
\
        /* Walk the diagonal: row and column advance together. */ \
        for ( i = 0; i < min_m_n; ++i ) \
        { \
            bli_zinvert1es( *(y_off_ri + i*(rs_y) + i*(cs_y)), \
                            *(y_off_ir + i*(rs_y) + i*(cs_y)) ); \
        } \
    } \
    else /* if ( bli_is_1r_packed( schema ) ) */ \
    { \
        inc_t rs_y2 = (rs_y); \
        inc_t cs_y2 = (cs_y); \
\
        /* Scale the non-unit stride by two for the 1r loop, which steps */ \
        /* in units of real (not complex) values. */ \
        if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
        else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
        double* restrict y_cast  = ( double* )(y); \
        double* restrict y_off_r = y_cast + (offm)*rs_y2 \
                                          + (offn)*cs_y2; \
        double* restrict y_off_i = y_cast + (offm)*rs_y2 \
                                          + (offn)*cs_y2 + (ld_y); \
\
        for ( i = 0; i < min_m_n; ++i ) \
        { \
            bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \
                            *(y_off_i + i*rs_y2 + i*cs_y2) ); \
        } \
    } \
}
#endif

View File

@@ -0,0 +1,124 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SCAL1MS_MXN_H
#define BLIS_SCAL1MS_MXN_H
// scal1ms_mxn
/*
   bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y )

   Applies the 1e or 1r "scal" element macro, with scalar *(a), to every
   element of the m x n matrix y (column index j outer, row index i inner).

   schema      Pack schema. If bli_is_1e_packed( schema ) is true the 1e
               branch runs; any other value takes the 1r branch.
   m, n        Number of rows and columns visited.
   a           Pointer to the scalar; dereferenced on every iteration.
   y           Base pointer; used as scomplex* in the 1e branch and cast to
               float* in the 1r branch.
   rs_y, cs_y  Row/column strides of y in scomplex elements. The 1r branch
               doubles whichever stride is not 1 because it indexes floats.
   ld_y        1e: the "ir" element sits ld_y/2 scomplex elements after its
               "ri" element. 1r: the imaginary part sits ld_y floats after
               the real part.

   Macro hygiene: every parameter is parenthesized where it is used. The
   body declares i, j, rs_y2, cs_y2, y_cast, y_ri, y_ir, y_r and y_i;
   argument expressions must not refer to caller variables with those
   names. Arguments are expanded more than once and must be free of side
   effects.
*/
#define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
    dim_t i, j; \
\
    /* Handle 1e and 1r separately. */ \
    if ( bli_is_1e_packed( schema ) ) \
    { \
        scomplex* restrict y_ri = (y); \
        scomplex* restrict y_ir = (y) + (ld_y)/2; \
\
        for ( j = 0; j < (n); ++j ) \
        for ( i = 0; i < (m); ++i ) \
        { \
            bli_cscal1es( *(a), \
                          *(y_ri + i*(rs_y) + j*(cs_y)), \
                          *(y_ir + i*(rs_y) + j*(cs_y)) ); \
        } \
    } \
    else /* if ( bli_is_1r_packed( schema ) ) */ \
    { \
        inc_t rs_y2 = (rs_y); \
        inc_t cs_y2 = (cs_y); \
\
        /* Scale the non-unit stride by two for the 1r loop, which steps */ \
        /* in units of real (not complex) values. */ \
        if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
        else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
        float* restrict y_cast = ( float* )(y); \
        float* restrict y_r    = y_cast; \
        float* restrict y_i    = y_cast + (ld_y); \
\
        for ( j = 0; j < (n); ++j ) \
        for ( i = 0; i < (m); ++i ) \
        { \
            bli_cscal1rs( *(a), \
                          *(y_r + i*rs_y2 + j*cs_y2), \
                          *(y_i + i*rs_y2 + j*cs_y2) ); \
        } \
    } \
}
/*
   bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y )

   Double-precision variant. Applies the 1e or 1r "scal" element macro,
   with scalar *(a), to every element of the m x n matrix y (column index j
   outer, row index i inner).

   schema      Pack schema. If bli_is_1e_packed( schema ) is true the 1e
               branch runs; any other value takes the 1r branch.
   m, n        Number of rows and columns visited.
   a           Pointer to the scalar; dereferenced on every iteration.
   y           Base pointer; used as dcomplex* in the 1e branch and cast to
               double* in the 1r branch.
   rs_y, cs_y  Row/column strides of y in dcomplex elements. The 1r branch
               doubles whichever stride is not 1 because it indexes doubles.
   ld_y        1e: the "ir" element sits ld_y/2 dcomplex elements after its
               "ri" element. 1r: the imaginary part sits ld_y doubles after
               the real part.

   Macro hygiene: every parameter is parenthesized where it is used. The
   body declares i, j, rs_y2, cs_y2, y_cast, y_ri, y_ir, y_r and y_i;
   argument expressions must not refer to caller variables with those
   names. Arguments are expanded more than once and must be free of side
   effects.
*/
#define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
    dim_t i, j; \
\
    /* Handle 1e and 1r separately. */ \
    if ( bli_is_1e_packed( schema ) ) \
    { \
        dcomplex* restrict y_ri = (y); \
        dcomplex* restrict y_ir = (y) + (ld_y)/2; \
\
        for ( j = 0; j < (n); ++j ) \
        for ( i = 0; i < (m); ++i ) \
        { \
            bli_zscal1es( *(a), \
                          *(y_ri + i*(rs_y) + j*(cs_y)), \
                          *(y_ir + i*(rs_y) + j*(cs_y)) ); \
        } \
    } \
    else /* if ( bli_is_1r_packed( schema ) ) */ \
    { \
        inc_t rs_y2 = (rs_y); \
        inc_t cs_y2 = (cs_y); \
\
        /* Scale the non-unit stride by two for the 1r loop, which steps */ \
        /* in units of real (not complex) values. */ \
        if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
        else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
        double* restrict y_cast = ( double* )(y); \
        double* restrict y_r    = y_cast; \
        double* restrict y_i    = y_cast + (ld_y); \
\
        for ( j = 0; j < (n); ++j ) \
        for ( i = 0; i < (m); ++i ) \
        { \
            bli_zscal1rs( *(a), \
                          *(y_r + i*rs_y2 + j*cs_y2), \
                          *(y_i + i*rs_y2 + j*cs_y2) ); \
        } \
    } \
}
#endif

View File

@@ -0,0 +1,126 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SCAL21MS_MXN_DIAG_H
#define BLIS_SCAL21MS_MXN_DIAG_H
// scal21ms_mxn_diag
/*
   bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y )

   For each of the first min(m,n) diagonal positions, invokes the 1e or 1r
   "scscal21" element macro on the diagonal element of x, the scalar *(a),
   and the corresponding diagonal element pair of y.

   Note the argument order used below: the element of x is passed in the
   first position of bli_scscal21es() / bli_scscal21rs() and *(a) in the
   second.

   schema      Pack schema. If bli_is_1e_packed( schema ) is true the 1e
               branch runs; any other value takes the 1r branch.
   m, n        Only min(m,n) is used: the number of elements visited.
   a           Pointer to the scalar; dereferenced on every iteration.
   x           Base pointer of the source matrix.
   rs_x, cs_x  Row/column strides of x, in elements of x.
   y           Base pointer of the destination; used as scomplex* in the 1e
               branch and cast to float* in the 1r branch.
   rs_y, cs_y  Row/column strides of y in scomplex elements. The 1r branch
               doubles whichever stride is not 1 because it indexes floats.
   ld_y        1e: the "ir" element sits ld_y/2 scomplex elements after its
               "ri" element. 1r: the imaginary part sits ld_y floats after
               the real part.

   Macro hygiene: every parameter is parenthesized where it is used. The
   body declares min_m_n, i, rs_y2, cs_y2, y_cast and y_off_*; argument
   expressions must not refer to caller variables with those names.
   Arguments are expanded more than once and must be free of side effects.
*/
#define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \
{ \
    dim_t min_m_n = bli_min( m, n ); \
    dim_t i; \
\
    /* Handle 1e and 1r separately. */ \
    if ( bli_is_1e_packed( schema ) ) \
    { \
        scomplex* restrict y_off_ri = (y); \
        scomplex* restrict y_off_ir = (y) + (ld_y)/2; \
\
        for ( i = 0; i < min_m_n; ++i ) \
        { \
            bli_scscal21es( *((x)      + i*(rs_x) + i*(cs_x)), \
                            *(a), \
                            *(y_off_ri + i*(rs_y) + i*(cs_y)), \
                            *(y_off_ir + i*(rs_y) + i*(cs_y)) ); \
        } \
    } \
    else /* if ( bli_is_1r_packed( schema ) ) */ \
    { \
        inc_t rs_y2 = (rs_y); \
        inc_t cs_y2 = (cs_y); \
\
        /* Scale the non-unit stride by two for the 1r loop, which steps */ \
        /* in units of real (not complex) values. */ \
        if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
        else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
        float* restrict y_cast  = ( float* )(y); \
        float* restrict y_off_r = y_cast; \
        float* restrict y_off_i = y_cast + (ld_y); \
\
        for ( i = 0; i < min_m_n; ++i ) \
        { \
            bli_scscal21rs( *((x)     + i*(rs_x) + i*(cs_x)), \
                            *(a), \
                            *(y_off_r + i*rs_y2  + i*cs_y2), \
                            *(y_off_i + i*rs_y2  + i*cs_y2) ); \
        } \
    } \
}
/*
   bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y )

   Double-precision variant. For each of the first min(m,n) diagonal
   positions, invokes the 1e or 1r "dzscal21" element macro on the diagonal
   element of x, the scalar *(a), and the corresponding diagonal element
   pair of y.

   Note the argument order used below: the element of x is passed in the
   first position of bli_dzscal21es() / bli_dzscal21rs() and *(a) in the
   second.

   schema      Pack schema. If bli_is_1e_packed( schema ) is true the 1e
               branch runs; any other value takes the 1r branch.
   m, n        Only min(m,n) is used: the number of elements visited.
   a           Pointer to the scalar; dereferenced on every iteration.
   x           Base pointer of the source matrix.
   rs_x, cs_x  Row/column strides of x, in elements of x.
   y           Base pointer of the destination; used as dcomplex* in the 1e
               branch and cast to double* in the 1r branch.
   rs_y, cs_y  Row/column strides of y in dcomplex elements. The 1r branch
               doubles whichever stride is not 1 because it indexes doubles.
   ld_y        1e: the "ir" element sits ld_y/2 dcomplex elements after its
               "ri" element. 1r: the imaginary part sits ld_y doubles after
               the real part.

   Macro hygiene: every parameter is parenthesized where it is used. The
   body declares min_m_n, i, rs_y2, cs_y2, y_cast and y_off_*; argument
   expressions must not refer to caller variables with those names.
   Arguments are expanded more than once and must be free of side effects.
*/
#define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \
{ \
    dim_t min_m_n = bli_min( m, n ); \
    dim_t i; \
\
    /* Handle 1e and 1r separately. */ \
    if ( bli_is_1e_packed( schema ) ) \
    { \
        dcomplex* restrict y_off_ri = (y); \
        dcomplex* restrict y_off_ir = (y) + (ld_y)/2; \
\
        for ( i = 0; i < min_m_n; ++i ) \
        { \
            bli_dzscal21es( *((x)      + i*(rs_x) + i*(cs_x)), \
                            *(a), \
                            *(y_off_ri + i*(rs_y) + i*(cs_y)), \
                            *(y_off_ir + i*(rs_y) + i*(cs_y)) ); \
        } \
    } \
    else /* if ( bli_is_1r_packed( schema ) ) */ \
    { \
        inc_t rs_y2 = (rs_y); \
        inc_t cs_y2 = (cs_y); \
\
        /* Scale the non-unit stride by two for the 1r loop, which steps */ \
        /* in units of real (not complex) values. */ \
        if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
        else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
        double* restrict y_cast  = ( double* )(y); \
        double* restrict y_off_r = y_cast; \
        double* restrict y_off_i = y_cast + (ld_y); \
\
        for ( i = 0; i < min_m_n; ++i ) \
        { \
            bli_dzscal21rs( *((x)     + i*(rs_x) + i*(cs_x)), \
                            *(a), \
                            *(y_off_r + i*rs_y2  + i*cs_y2), \
                            *(y_off_i + i*rs_y2  + i*cs_y2) ); \
        } \
    } \
}
#endif

View File

@@ -0,0 +1,296 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SCAL21MS_MXN_UPLO_H
#define BLIS_SCAL21MS_MXN_UPLO_H
// scal21ms_mxn_uplo
/*
   bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y )

   Visits one triangle (diagonal included) of the m x m matrix x and, for
   each visited element, invokes a 1e or 1r "scal2" element macro with
   scalar *(a), the element of x, and the matching element pair of y.

   Which elements are visited:
     - bli_is_lower( uplo ) true : for each column j, rows i = j .. m-1;
     - otherwise (upper)         : for each column j, rows i = 0 .. j.

   Which element macro is used:
     - bli_is_conj( conjx ) true : the "scal2j" form (bli_cscal2j1es /
                                   bli_cscal2j1rs);
     - otherwise                 : the "scal2" form (bli_cscal21es /
                                   bli_cscal21rs).

   schema      Pack schema. If bli_is_1e_packed( schema ) is true the 1e
               branch runs; any other value takes the 1r branch.
   m           Dimension of the square region.
   a           Pointer to the scalar; dereferenced on every iteration.
   x           Base pointer of the source matrix.
   rs_x, cs_x  Row/column strides of x, in elements of x.
   y           Base pointer of the destination; used as scomplex* in the 1e
               branch and cast to float* in the 1r branch.
   rs_y, cs_y  Row/column strides of y in scomplex elements. The 1r branch
               doubles whichever stride is not 1 because it indexes floats.
   ld_y        1e: the "ir" element sits ld_y/2 scomplex elements after its
               "ri" element. 1r: the imaginary part sits ld_y floats after
               the real part.

   Macro hygiene: every parameter is parenthesized where it is used. The
   body declares i, j, rs_y2, cs_y2, y_cast, y_ri, y_ir, y_r and y_i;
   argument expressions must not refer to caller variables with those
   names. Arguments are expanded more than once and must be free of side
   effects.
*/
#define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \
{ \
    dim_t i, j; \
\
    /* Handle 1e and 1r separately. */ \
    if ( bli_is_1e_packed( schema ) ) \
    { \
        scomplex* restrict y_ri = (y); \
        scomplex* restrict y_ir = (y) + (ld_y)/2; \
\
        if ( bli_is_lower( uplo ) ) \
        { \
            if ( bli_is_conj( conjx ) ) \
            { \
                for ( j = 0; j < (m); ++j ) \
                for ( i = j; i < (m); ++i ) \
                { \
                    bli_cscal2j1es( *(a), \
                                    *((x)  + i*(rs_x) + j*(cs_x)), \
                                    *(y_ri + i*(rs_y) + j*(cs_y)), \
                                    *(y_ir + i*(rs_y) + j*(cs_y)) ); \
                } \
            } \
            else /* if ( bli_is_noconj( conjx ) ) */ \
            { \
                for ( j = 0; j < (m); ++j ) \
                for ( i = j; i < (m); ++i ) \
                { \
                    bli_cscal21es( *(a), \
                                   *((x)  + i*(rs_x) + j*(cs_x)), \
                                   *(y_ri + i*(rs_y) + j*(cs_y)), \
                                   *(y_ir + i*(rs_y) + j*(cs_y)) ); \
                } \
            } \
        } \
        else /* if ( bli_is_upper( uplo ) ) */ \
        { \
            if ( bli_is_conj( conjx ) ) \
            { \
                for ( j = 0; j < (m); ++j ) \
                for ( i = 0; i < j + 1; ++i ) \
                { \
                    bli_cscal2j1es( *(a), \
                                    *((x)  + i*(rs_x) + j*(cs_x)), \
                                    *(y_ri + i*(rs_y) + j*(cs_y)), \
                                    *(y_ir + i*(rs_y) + j*(cs_y)) ); \
                } \
            } \
            else /* if ( bli_is_noconj( conjx ) ) */ \
            { \
                for ( j = 0; j < (m); ++j ) \
                for ( i = 0; i < j + 1; ++i ) \
                { \
                    bli_cscal21es( *(a), \
                                   *((x)  + i*(rs_x) + j*(cs_x)), \
                                   *(y_ri + i*(rs_y) + j*(cs_y)), \
                                   *(y_ir + i*(rs_y) + j*(cs_y)) ); \
                } \
            } \
        } \
    } \
    else /* if ( bli_is_1r_packed( schema ) ) */ \
    { \
        inc_t rs_y2 = (rs_y); \
        inc_t cs_y2 = (cs_y); \
\
        /* Scale the non-unit stride by two for the 1r loop, which steps */ \
        /* in units of real (not complex) values. */ \
        if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
        else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
        float* restrict y_cast = ( float* )(y); \
        float* restrict y_r    = y_cast; \
        float* restrict y_i    = y_cast + (ld_y); \
\
        if ( bli_is_lower( uplo ) ) \
        { \
            if ( bli_is_conj( conjx ) ) \
            { \
                for ( j = 0; j < (m); ++j ) \
                for ( i = j; i < (m); ++i ) \
                { \
                    bli_cscal2j1rs( *(a), \
                                    *((x) + i*(rs_x) + j*(cs_x)), \
                                    *(y_r + i*rs_y2  + j*cs_y2), \
                                    *(y_i + i*rs_y2  + j*cs_y2) ); \
                } \
            } \
            else /* if ( bli_is_noconj( conjx ) ) */ \
            { \
                for ( j = 0; j < (m); ++j ) \
                for ( i = j; i < (m); ++i ) \
                { \
                    bli_cscal21rs( *(a), \
                                   *((x) + i*(rs_x) + j*(cs_x)), \
                                   *(y_r + i*rs_y2  + j*cs_y2), \
                                   *(y_i + i*rs_y2  + j*cs_y2) ); \
                } \
            } \
        } \
        else /* if ( bli_is_upper( uplo ) ) */ \
        { \
            if ( bli_is_conj( conjx ) ) \
            { \
                for ( j = 0; j < (m); ++j ) \
                for ( i = 0; i < j + 1; ++i ) \
                { \
                    bli_cscal2j1rs( *(a), \
                                    *((x) + i*(rs_x) + j*(cs_x)), \
                                    *(y_r + i*rs_y2  + j*cs_y2), \
                                    *(y_i + i*rs_y2  + j*cs_y2) ); \
                } \
            } \
            else /* if ( bli_is_noconj( conjx ) ) */ \
            { \
                for ( j = 0; j < (m); ++j ) \
                for ( i = 0; i < j + 1; ++i ) \
                { \
                    bli_cscal21rs( *(a), \
                                   *((x) + i*(rs_x) + j*(cs_x)), \
                                   *(y_r + i*rs_y2  + j*cs_y2), \
                                   *(y_i + i*rs_y2  + j*cs_y2) ); \
                } \
            } \
        } \
    } \
}
/*
   bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y )

   Double-precision variant. Visits one triangle (diagonal included) of the
   m x m matrix x and, for each visited element, invokes a 1e or 1r "scal2"
   element macro with scalar *(a), the element of x, and the matching
   element pair of y.

   Which elements are visited:
     - bli_is_lower( uplo ) true : for each column j, rows i = j .. m-1;
     - otherwise (upper)         : for each column j, rows i = 0 .. j.

   Which element macro is used:
     - bli_is_conj( conjx ) true : the "scal2j" form (bli_zscal2j1es /
                                   bli_zscal2j1rs);
     - otherwise                 : the "scal2" form (bli_zscal21es /
                                   bli_zscal21rs).

   schema      Pack schema. If bli_is_1e_packed( schema ) is true the 1e
               branch runs; any other value takes the 1r branch.
   m           Dimension of the square region.
   a           Pointer to the scalar; dereferenced on every iteration.
   x           Base pointer of the source matrix.
   rs_x, cs_x  Row/column strides of x, in elements of x.
   y           Base pointer of the destination; used as dcomplex* in the 1e
               branch and cast to double* in the 1r branch.
   rs_y, cs_y  Row/column strides of y in dcomplex elements. The 1r branch
               doubles whichever stride is not 1 because it indexes doubles.
   ld_y        1e: the "ir" element sits ld_y/2 dcomplex elements after its
               "ri" element. 1r: the imaginary part sits ld_y doubles after
               the real part.

   Macro hygiene: every parameter is parenthesized where it is used. The
   body declares i, j, rs_y2, cs_y2, y_cast, y_ri, y_ir, y_r and y_i;
   argument expressions must not refer to caller variables with those
   names. Arguments are expanded more than once and must be free of side
   effects.
*/
#define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \
{ \
    dim_t i, j; \
\
    /* Handle 1e and 1r separately. */ \
    if ( bli_is_1e_packed( schema ) ) \
    { \
        dcomplex* restrict y_ri = (y); \
        dcomplex* restrict y_ir = (y) + (ld_y)/2; \
\
        if ( bli_is_lower( uplo ) ) \
        { \
            if ( bli_is_conj( conjx ) ) \
            { \
                for ( j = 0; j < (m); ++j ) \
                for ( i = j; i < (m); ++i ) \
                { \
                    bli_zscal2j1es( *(a), \
                                    *((x)  + i*(rs_x) + j*(cs_x)), \
                                    *(y_ri + i*(rs_y) + j*(cs_y)), \
                                    *(y_ir + i*(rs_y) + j*(cs_y)) ); \
                } \
            } \
            else /* if ( bli_is_noconj( conjx ) ) */ \
            { \
                for ( j = 0; j < (m); ++j ) \
                for ( i = j; i < (m); ++i ) \
                { \
                    bli_zscal21es( *(a), \
                                   *((x)  + i*(rs_x) + j*(cs_x)), \
                                   *(y_ri + i*(rs_y) + j*(cs_y)), \
                                   *(y_ir + i*(rs_y) + j*(cs_y)) ); \
                } \
            } \
        } \
        else /* if ( bli_is_upper( uplo ) ) */ \
        { \
            if ( bli_is_conj( conjx ) ) \
            { \
                for ( j = 0; j < (m); ++j ) \
                for ( i = 0; i < j + 1; ++i ) \
                { \
                    bli_zscal2j1es( *(a), \
                                    *((x)  + i*(rs_x) + j*(cs_x)), \
                                    *(y_ri + i*(rs_y) + j*(cs_y)), \
                                    *(y_ir + i*(rs_y) + j*(cs_y)) ); \
                } \
            } \
            else /* if ( bli_is_noconj( conjx ) ) */ \
            { \
                for ( j = 0; j < (m); ++j ) \
                for ( i = 0; i < j + 1; ++i ) \
                { \
                    bli_zscal21es( *(a), \
                                   *((x)  + i*(rs_x) + j*(cs_x)), \
                                   *(y_ri + i*(rs_y) + j*(cs_y)), \
                                   *(y_ir + i*(rs_y) + j*(cs_y)) ); \
                } \
            } \
        } \
    } \
    else /* if ( bli_is_1r_packed( schema ) ) */ \
    { \
        inc_t rs_y2 = (rs_y); \
        inc_t cs_y2 = (cs_y); \
\
        /* Scale the non-unit stride by two for the 1r loop, which steps */ \
        /* in units of real (not complex) values. */ \
        if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
        else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
        double* restrict y_cast = ( double* )(y); \
        double* restrict y_r    = y_cast; \
        double* restrict y_i    = y_cast + (ld_y); \
\
        if ( bli_is_lower( uplo ) ) \
        { \
            if ( bli_is_conj( conjx ) ) \
            { \
                for ( j = 0; j < (m); ++j ) \
                for ( i = j; i < (m); ++i ) \
                { \
                    bli_zscal2j1rs( *(a), \
                                    *((x) + i*(rs_x) + j*(cs_x)), \
                                    *(y_r + i*rs_y2  + j*cs_y2), \
                                    *(y_i + i*rs_y2  + j*cs_y2) ); \
                } \
            } \
            else /* if ( bli_is_noconj( conjx ) ) */ \
            { \
                for ( j = 0; j < (m); ++j ) \
                for ( i = j; i < (m); ++i ) \
                { \
                    bli_zscal21rs( *(a), \
                                   *((x) + i*(rs_x) + j*(cs_x)), \
                                   *(y_r + i*rs_y2  + j*cs_y2), \
                                   *(y_i + i*rs_y2  + j*cs_y2) ); \
                } \
            } \
        } \
        else /* if ( bli_is_upper( uplo ) ) */ \
        { \
            if ( bli_is_conj( conjx ) ) \
            { \
                for ( j = 0; j < (m); ++j ) \
                for ( i = 0; i < j + 1; ++i ) \
                { \
                    bli_zscal2j1rs( *(a), \
                                    *((x) + i*(rs_x) + j*(cs_x)), \
                                    *(y_r + i*rs_y2  + j*cs_y2), \
                                    *(y_i + i*rs_y2  + j*cs_y2) ); \
                } \
            } \
            else /* if ( bli_is_noconj( conjx ) ) */ \
            { \
                for ( j = 0; j < (m); ++j ) \
                for ( i = 0; i < j + 1; ++i ) \
                { \
                    bli_zscal21rs( *(a), \
                                   *((x) + i*(rs_x) + j*(cs_x)), \
                                   *(y_r + i*rs_y2  + j*cs_y2), \
                                   *(y_i + i*rs_y2  + j*cs_y2) ); \
                } \
            } \
        } \
    } \
}
#endif

View File

@@ -0,0 +1,164 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SET1MS_MXN_H
#define BLIS_SET1MS_MXN_H
// set1ms_mxn
/*
   bli_cset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y )

   Invokes the 1e or 1r "copy" element macro with source *(a) on every
   element of the m x n region of y whose top-left element is (offm,offn).

   schema      Pack schema. If bli_is_1e_packed( schema ) is true the 1e
               branch runs; any other value takes the 1r branch.
   offm, offn  Row and column offsets of the region within y.
   m, n        Number of rows and columns in the region.
   a           Pointer to the source value; dereferenced on every iteration.
   y           Base pointer; used as scomplex* in the 1e branch and cast to
               float* in the 1r branch.
   rs_y, cs_y  Row/column strides of y in scomplex elements. The 1r branch
               doubles whichever stride is not 1 because it indexes floats.
   ld_y        1e: the "ir" element sits ld_y/2 scomplex elements after its
               "ri" element. 1r: the imaginary part sits ld_y floats after
               the real part.

   Macro hygiene: every parameter is parenthesized where it is used.
   offm, offn, m and n are copied into locals BEFORE the loop indices i and
   j are declared, so a caller may pass its own variables named i or j for
   those four arguments; keep that declaration order. The remaining
   arguments are expanded after i and j are in scope and must not mention
   them, nor any of the other locals declared here (*_local, rs_y1, cs_y1,
   rs_y2, cs_y2, y_cast, y_off_*). a, y, cs_y and ld_y are expanded more
   than once and must be free of side effects.
*/
#define bli_cset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
    inc_t offm_local = (offm); \
    inc_t offn_local = (offn); \
    dim_t m_local    = (m); \
    dim_t n_local    = (n); \
    inc_t rs_y1      = (rs_y); \
    inc_t cs_y1      = (cs_y); \
    inc_t rs_y2      = (rs_y); \
    inc_t cs_y2      = (cs_y); \
    dim_t i, j; \
\
    /* Optimization: The loops walk through y with unit stride if y is */ \
    /* column-stored. If y is row-stored, swap the dimensions and strides */ \
    /* to preserve unit stride movement. */ \
    if ( (cs_y) == 1 ) \
    { \
        bli_swap_incs( offm_local, offn_local ); \
        bli_swap_dims( m_local, n_local ); \
        bli_swap_incs( rs_y1, cs_y1 ); \
        bli_swap_incs( rs_y2, cs_y2 ); \
    } \
\
    /* Handle 1e and 1r separately. */ \
    if ( bli_is_1e_packed( schema ) ) \
    { \
        scomplex* restrict y_off_ri = (y) + (offm_local)*rs_y1 \
                                          + (offn_local)*cs_y1; \
        scomplex* restrict y_off_ir = (y) + (offm_local)*rs_y1 \
                                          + (offn_local)*cs_y1 + (ld_y)/2; \
\
        for ( j = 0; j < n_local; ++j ) \
        for ( i = 0; i < m_local; ++i ) \
        { \
            bli_ccopy1es( *(a), \
                          *(y_off_ri + i*rs_y1 + j*cs_y1), \
                          *(y_off_ir + i*rs_y1 + j*cs_y1) ); \
        } \
    } \
    else /* if ( bli_is_1r_packed( schema ) ) */ \
    { \
        /* Scale the non-unit stride by two for the 1r loop, which steps */ \
        /* in units of real (not complex) values. */ \
        if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
        else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
        float* restrict y_cast  = ( float* )(y); \
        float* restrict y_off_r = y_cast + (offm_local)*rs_y2 \
                                         + (offn_local)*cs_y2; \
        float* restrict y_off_i = y_cast + (offm_local)*rs_y2 \
                                         + (offn_local)*cs_y2 + (ld_y); \
\
        for ( j = 0; j < n_local; ++j ) \
        for ( i = 0; i < m_local; ++i ) \
        { \
            bli_ccopy1rs( *(a), \
                          *(y_off_r + i*rs_y2 + j*cs_y2), \
                          *(y_off_i + i*rs_y2 + j*cs_y2) ); \
        } \
    } \
}
/*
   bli_zset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y )

   Double-precision variant. Invokes the 1e or 1r "copy" element macro with
   source *(a) on every element of the m x n region of y whose top-left
   element is (offm,offn).

   schema      Pack schema. If bli_is_1e_packed( schema ) is true the 1e
               branch runs; any other value takes the 1r branch.
   offm, offn  Row and column offsets of the region within y.
   m, n        Number of rows and columns in the region.
   a           Pointer to the source value; dereferenced on every iteration.
   y           Base pointer; used as dcomplex* in the 1e branch and cast to
               double* in the 1r branch.
   rs_y, cs_y  Row/column strides of y in dcomplex elements. The 1r branch
               doubles whichever stride is not 1 because it indexes doubles.
   ld_y        1e: the "ir" element sits ld_y/2 dcomplex elements after its
               "ri" element. 1r: the imaginary part sits ld_y doubles after
               the real part.

   Macro hygiene: every parameter is parenthesized where it is used.
   offm, offn, m and n are copied into locals BEFORE the loop indices i and
   j are declared, so a caller may pass its own variables named i or j for
   those four arguments; keep that declaration order. The remaining
   arguments are expanded after i and j are in scope and must not mention
   them, nor any of the other locals declared here (*_local, rs_y1, cs_y1,
   rs_y2, cs_y2, y_cast, y_off_*). a, y, cs_y and ld_y are expanded more
   than once and must be free of side effects.
*/
#define bli_zset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
    inc_t offm_local = (offm); \
    inc_t offn_local = (offn); \
    dim_t m_local    = (m); \
    dim_t n_local    = (n); \
    inc_t rs_y1      = (rs_y); \
    inc_t cs_y1      = (cs_y); \
    inc_t rs_y2      = (rs_y); \
    inc_t cs_y2      = (cs_y); \
    dim_t i, j; \
\
    /* Optimization: The loops walk through y with unit stride if y is */ \
    /* column-stored. If y is row-stored, swap the dimensions and strides */ \
    /* to preserve unit stride movement. */ \
    if ( (cs_y) == 1 ) \
    { \
        bli_swap_incs( offm_local, offn_local ); \
        bli_swap_dims( m_local, n_local ); \
        bli_swap_incs( rs_y1, cs_y1 ); \
        bli_swap_incs( rs_y2, cs_y2 ); \
    } \
\
    /* Handle 1e and 1r separately. */ \
    if ( bli_is_1e_packed( schema ) ) \
    { \
        dcomplex* restrict y_off_ri = (y) + (offm_local)*rs_y1 \
                                          + (offn_local)*cs_y1; \
        dcomplex* restrict y_off_ir = (y) + (offm_local)*rs_y1 \
                                          + (offn_local)*cs_y1 + (ld_y)/2; \
\
        for ( j = 0; j < n_local; ++j ) \
        for ( i = 0; i < m_local; ++i ) \
        { \
            bli_zcopy1es( *(a), \
                          *(y_off_ri + i*rs_y1 + j*cs_y1), \
                          *(y_off_ir + i*rs_y1 + j*cs_y1) ); \
        } \
    } \
    else /* if ( bli_is_1r_packed( schema ) ) */ \
    { \
        /* Scale the non-unit stride by two for the 1r loop, which steps */ \
        /* in units of real (not complex) values. */ \
        if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
        else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
        double* restrict y_cast  = ( double* )(y); \
        double* restrict y_off_r = y_cast + (offm_local)*rs_y2 \
                                          + (offn_local)*cs_y2; \
        double* restrict y_off_i = y_cast + (offm_local)*rs_y2 \
                                          + (offn_local)*cs_y2 + (ld_y); \
\
        for ( j = 0; j < n_local; ++j ) \
        for ( i = 0; i < m_local; ++i ) \
        { \
            bli_zcopy1rs( *(a), \
                          *(y_off_r + i*rs_y2 + j*cs_y2), \
                          *(y_off_i + i*rs_y2 + j*cs_y2) ); \
        } \
    } \
}
#endif

View File

@@ -0,0 +1,130 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SET1MS_MXN_DIAG_H
#define BLIS_SET1MS_MXN_DIAG_H
// set1ms_mxn_diag
/*
   bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y )

   Invokes the 1e or 1r "copy" element macro with source *(a) on min(m,n)
   consecutive diagonal elements of y, beginning at element (offm,offn).

   schema      Pack schema. If bli_is_1e_packed( schema ) is true the 1e
               branch runs; any other value takes the 1r branch.
   offm, offn  Row and column offsets of the first element visited.
   m, n        Only min(m,n) is used: the number of elements visited.
   a           Pointer to the source value; dereferenced on every iteration.
   y           Base pointer; used as scomplex* in the 1e branch and cast to
               float* in the 1r branch.
   rs_y, cs_y  Row/column strides of y in scomplex elements. The 1r branch
               doubles whichever stride is not 1 because it indexes floats.
   ld_y        1e: the "ir" element sits ld_y/2 scomplex elements after its
               "ri" element. 1r: the imaginary part sits ld_y floats after
               the real part.

   Macro hygiene: every parameter is parenthesized where it is used. The
   body declares min_m_n, i, rs_y2, cs_y2, y_cast and y_off_* before the
   offsets are expanded, so argument expressions (including offm/offn)
   must not refer to caller variables with those names. Arguments are
   expanded more than once and must be free of side effects.
*/
#define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
    dim_t min_m_n = bli_min( m, n ); \
    dim_t i; \
\
    /* Handle 1e and 1r separately. */ \
    if ( bli_is_1e_packed( schema ) ) \
    { \
        scomplex* restrict y_off_ri = (y) + (offm)*(rs_y) \
                                          + (offn)*(cs_y); \
        scomplex* restrict y_off_ir = (y) + (offm)*(rs_y) \
                                          + (offn)*(cs_y) + (ld_y)/2; \
\
        /* Walk the diagonal: row and column advance together. */ \
        for ( i = 0; i < min_m_n; ++i ) \
        { \
            bli_ccopy1es( *(a), \
                          *(y_off_ri + i*(rs_y) + i*(cs_y)), \
                          *(y_off_ir + i*(rs_y) + i*(cs_y)) ); \
        } \
    } \
    else /* if ( bli_is_1r_packed( schema ) ) */ \
    { \
        inc_t rs_y2 = (rs_y); \
        inc_t cs_y2 = (cs_y); \
\
        /* Scale the non-unit stride by two for the 1r loop, which steps */ \
        /* in units of real (not complex) values. */ \
        if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
        else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
        float* restrict y_cast  = ( float* )(y); \
        float* restrict y_off_r = y_cast + (offm)*rs_y2 \
                                         + (offn)*cs_y2; \
        float* restrict y_off_i = y_cast + (offm)*rs_y2 \
                                         + (offn)*cs_y2 + (ld_y); \
\
        for ( i = 0; i < min_m_n; ++i ) \
        { \
            bli_ccopy1rs( *(a), \
                          *(y_off_r + i*rs_y2 + i*cs_y2), \
                          *(y_off_i + i*rs_y2 + i*cs_y2) ); \
        } \
    } \
}
/*
   bli_zset1ms_mxn_diag

   Write the dcomplex scalar *a to min( m, n ) diagonal elements of the
   packed matrix y, starting at element (offm, offn).

   schema      - pack schema. The first branch runs when
                 bli_is_1e_packed( schema ) is true; every other value is
                 treated as 1r (no separate check is made).
   offm, offn  - row / column offset of the first diagonal element.
   m, n        - dimensions; only min( m, n ) is used.
   a           - pointer to the source scalar (dereferenced as *(a)).
   y           - base address of the destination; expected to be dcomplex*.
   rs_y, cs_y  - row / column strides of y in units of complex elements.
                 The 1r branch doubles whichever stride is not 1 so that it
                 can index y as an array of double; it assumes one of the
                 two strides is 1.
   ld_y        - distance between the two stored parts. The 1e branch puts
                 the second ("ir") element ld_y/2 complex elements after the
                 first ("ri"); the 1r branch puts the imaginary value ld_y
                 double elements after the real value.

   The per-element stores are delegated to bli_zcopy1es / bli_zcopy1rs.

   Every macro parameter is parenthesized where it is used so that
   expression arguments (for example "p + 1" or "ld + pad") expand with the
   intended precedence. Parameters are still evaluated more than once, so
   arguments must not have side effects.
*/
#define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t min_m_n = bli_min( m, n ); \
	dim_t i; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		/* Both pointers address the (offm, offn) element; the "ir" copy \
		   lives ld_y/2 complex elements past the "ri" copy. */ \
		dcomplex* restrict y_off_ri = (y) + (offm)*(rs_y) \
		                                  + (offn)*(cs_y); \
		dcomplex* restrict y_off_ir = (y) + (offm)*(rs_y) \
		                                  + (offn)*(cs_y) + (ld_y)/2; \
\
		/* Step along the diagonal: one row and one column per iteration. */ \
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_zcopy1es( *(a), \
			              *(y_off_ri + i*(rs_y) + i*(cs_y)), \
			              *(y_off_ir + i*(rs_y) + i*(cs_y)) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = (rs_y); \
		inc_t cs_y2 = (cs_y); \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps \
		   in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		/* View y as an array of double; the imaginary values start ld_y \
		   double elements after the real values. */ \
		double* restrict y_cast  = ( double* )(y); \
		double* restrict y_off_r = y_cast + (offm)*rs_y2 \
		                                  + (offn)*cs_y2; \
		double* restrict y_off_i = y_cast + (offm)*rs_y2 \
		                                  + (offn)*cs_y2 + (ld_y); \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_zcopy1rs( *(a), \
			              *(y_off_r + i*rs_y2 + i*cs_y2), \
			              *(y_off_i + i*rs_y2 + i*cs_y2) ); \
		} \
	} \
}
#endif

View File

@@ -0,0 +1,198 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SET1MS_MXN_UPLO_H
#define BLIS_SET1MS_MXN_UPLO_H
// set1ms_mxn_uplo
/*
   bli_cset1ms_mxn_uplo

   Write the scomplex scalar *a to a triangular region of the packed
   matrix y.

   The base address is first advanced by |diagoff| elements: along the
   column stride when diagoff > 0, along the row stride otherwise
   (diagoff == 0 yields no shift). Relative to that shifted base:
     - lower: every (i, j) with 0 <= j < n and j <= i < m is written;
     - upper: every (i, j) with 0 <= j < n and 0 <= i <= j is written.
   The lower path is selected by bli_is_lower( uplo ); any other value of
   uplo is handled as upper.

   schema      - pack schema. The first branch runs when
                 bli_is_1e_packed( schema ) is true; every other value is
                 treated as 1r.
   diagoff     - diagonal offset, used as described above.
   uplo        - selects the lower or upper region.
   m, n        - loop bounds, used as described above.
   a           - pointer to the source scalar (dereferenced as *(a)).
   y           - base address of the destination; expected to be scomplex*.
   rs_y, cs_y  - row / column strides of y in units of complex elements.
                 The 1r branch doubles whichever stride is not 1 so that it
                 can index y as an array of float; it assumes one of the two
                 strides is 1.
   ld_y        - distance between the two stored parts: ld_y/2 complex
                 elements in the 1e branch, ld_y float elements in the 1r
                 branch.

   NOTE(review): m and n are not reduced after the diagoff shift, and the
   upper loops do not clamp i to m. Callers presumably pass dimensions that
   already describe the shifted region -- confirm against the call sites.

   Every macro parameter is parenthesized where it is used so that
   expression arguments expand with the intended precedence. Parameters are
   still evaluated more than once, so arguments must not have side effects.
*/
#define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
	doff_t diagoff_abs = bli_abs( diagoff ); \
	inc_t  offdiag_inc; \
	dim_t  i, j; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		/* Set the off-diagonal increment. */ \
		if ( (diagoff) > 0 )            offdiag_inc = (cs_y); \
		else /* if ( diagoff <= 0 ) */  offdiag_inc = (rs_y); \
\
		/* Shift the base by |diagoff| and locate the "ri" and "ir" parts. */ \
		scomplex* restrict y0   = (y) + (diagoff_abs)*offdiag_inc; \
		scomplex* restrict y_ri = y0; \
		scomplex* restrict y_ir = y0 + (ld_y)/2; \
\
		if ( bli_is_lower( uplo ) ) \
		{ \
			/* Column j: rows j through m-1. */ \
			for ( j = 0; j < (n); ++j ) \
			for ( i = j; i < (m); ++i ) \
			{ \
				bli_ccopy1es( *(a), \
				              *(y_ri + i*(rs_y) + j*(cs_y)), \
				              *(y_ir + i*(rs_y) + j*(cs_y)) ); \
			} \
		} \
		else /* if ( bli_is_upper( uplo ) ) */ \
		{ \
			/* Column j: rows 0 through j. */ \
			for ( j = 0; j < (n); ++j ) \
			for ( i = 0; i < j + 1; ++i ) \
			{ \
				bli_ccopy1es( *(a), \
				              *(y_ri + i*(rs_y) + j*(cs_y)), \
				              *(y_ir + i*(rs_y) + j*(cs_y)) ); \
			} \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = (rs_y); \
		inc_t cs_y2 = (cs_y); \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps \
		   in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		/* Set the off-diagonal increment (in real-valued units). */ \
		if ( (diagoff) > 0 )            offdiag_inc = cs_y2; \
		else /* if ( diagoff <= 0 ) */  offdiag_inc = rs_y2; \
\
		/* View y as float; imaginary values start ld_y floats after the \
		   real values. */ \
		float* restrict y0  = ( float* )(y) + (diagoff_abs)*offdiag_inc; \
		float* restrict y_r = y0; \
		float* restrict y_i = y0 + (ld_y); \
\
		if ( bli_is_lower( uplo ) ) \
		{ \
			/* Column j: rows j through m-1. */ \
			for ( j = 0; j < (n); ++j ) \
			for ( i = j; i < (m); ++i ) \
			{ \
				bli_ccopy1rs( *(a), \
				              *(y_r + i*rs_y2 + j*cs_y2), \
				              *(y_i + i*rs_y2 + j*cs_y2) ); \
			} \
		} \
		else /* if ( bli_is_upper( uplo ) ) */ \
		{ \
			/* Column j: rows 0 through j. */ \
			for ( j = 0; j < (n); ++j ) \
			for ( i = 0; i < j + 1; ++i ) \
			{ \
				bli_ccopy1rs( *(a), \
				              *(y_r + i*rs_y2 + j*cs_y2), \
				              *(y_i + i*rs_y2 + j*cs_y2) ); \
			} \
		} \
	} \
}
/*
   bli_zset1ms_mxn_uplo

   Write the dcomplex scalar *a to a triangular region of the packed
   matrix y.

   The base address is first advanced by |diagoff| elements: along the
   column stride when diagoff > 0, along the row stride otherwise
   (diagoff == 0 yields no shift). Relative to that shifted base:
     - lower: every (i, j) with 0 <= j < n and j <= i < m is written;
     - upper: every (i, j) with 0 <= j < n and 0 <= i <= j is written.
   The lower path is selected by bli_is_lower( uplo ); any other value of
   uplo is handled as upper.

   schema      - pack schema. The first branch runs when
                 bli_is_1e_packed( schema ) is true; every other value is
                 treated as 1r.
   diagoff     - diagonal offset, used as described above.
   uplo        - selects the lower or upper region.
   m, n        - loop bounds, used as described above.
   a           - pointer to the source scalar (dereferenced as *(a)).
   y           - base address of the destination; expected to be dcomplex*.
   rs_y, cs_y  - row / column strides of y in units of complex elements.
                 The 1r branch doubles whichever stride is not 1 so that it
                 can index y as an array of double; it assumes one of the
                 two strides is 1.
   ld_y        - distance between the two stored parts: ld_y/2 complex
                 elements in the 1e branch, ld_y double elements in the 1r
                 branch.

   NOTE(review): m and n are not reduced after the diagoff shift, and the
   upper loops do not clamp i to m. Callers presumably pass dimensions that
   already describe the shifted region -- confirm against the call sites.

   Every macro parameter is parenthesized where it is used so that
   expression arguments expand with the intended precedence. Parameters are
   still evaluated more than once, so arguments must not have side effects.
*/
#define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
	doff_t diagoff_abs = bli_abs( diagoff ); \
	inc_t  offdiag_inc; \
	dim_t  i, j; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		/* Set the off-diagonal increment. */ \
		if ( (diagoff) > 0 )            offdiag_inc = (cs_y); \
		else /* if ( diagoff <= 0 ) */  offdiag_inc = (rs_y); \
\
		/* Shift the base by |diagoff| and locate the "ri" and "ir" parts. */ \
		dcomplex* restrict y0   = (y) + (diagoff_abs)*offdiag_inc; \
		dcomplex* restrict y_ri = y0; \
		dcomplex* restrict y_ir = y0 + (ld_y)/2; \
\
		if ( bli_is_lower( uplo ) ) \
		{ \
			/* Column j: rows j through m-1. */ \
			for ( j = 0; j < (n); ++j ) \
			for ( i = j; i < (m); ++i ) \
			{ \
				bli_zcopy1es( *(a), \
				              *(y_ri + i*(rs_y) + j*(cs_y)), \
				              *(y_ir + i*(rs_y) + j*(cs_y)) ); \
			} \
		} \
		else /* if ( bli_is_upper( uplo ) ) */ \
		{ \
			/* Column j: rows 0 through j. */ \
			for ( j = 0; j < (n); ++j ) \
			for ( i = 0; i < j + 1; ++i ) \
			{ \
				bli_zcopy1es( *(a), \
				              *(y_ri + i*(rs_y) + j*(cs_y)), \
				              *(y_ir + i*(rs_y) + j*(cs_y)) ); \
			} \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = (rs_y); \
		inc_t cs_y2 = (cs_y); \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps \
		   in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		/* Set the off-diagonal increment (in real-valued units). */ \
		if ( (diagoff) > 0 )            offdiag_inc = cs_y2; \
		else /* if ( diagoff <= 0 ) */  offdiag_inc = rs_y2; \
\
		/* View y as double; imaginary values start ld_y doubles after the \
		   real values. */ \
		double* restrict y0  = ( double* )(y) + (diagoff_abs)*offdiag_inc; \
		double* restrict y_r = y0; \
		double* restrict y_i = y0 + (ld_y); \
\
		if ( bli_is_lower( uplo ) ) \
		{ \
			/* Column j: rows j through m-1. */ \
			for ( j = 0; j < (n); ++j ) \
			for ( i = j; i < (m); ++i ) \
			{ \
				bli_zcopy1rs( *(a), \
				              *(y_r + i*rs_y2 + j*cs_y2), \
				              *(y_i + i*rs_y2 + j*cs_y2) ); \
			} \
		} \
		else /* if ( bli_is_upper( uplo ) ) */ \
		{ \
			/* Column j: rows 0 through j. */ \
			for ( j = 0; j < (n); ++j ) \
			for ( i = 0; i < j + 1; ++i ) \
			{ \
				bli_zcopy1rs( *(a), \
				              *(y_r + i*rs_y2 + j*cs_y2), \
				              *(y_i + i*rs_y2 + j*cs_y2) ); \
			} \
		} \
	} \
}
#endif

View File

@@ -0,0 +1,114 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SETI01MS_MXN_DIAG_H
#define BLIS_SETI01MS_MXN_DIAG_H
// seti01ms_mxn_diag
/*
   bli_cseti01ms_mxn_diag

   Visit the first min( m, n ) diagonal elements of the packed scomplex
   matrix y (starting at its base address) and clear part of each one:

     - 1e branch (bli_is_1e_packed( schema ) true): applies bli_cseti0s to
       the "ri" element and bli_csetr0s to the "ir" element, which sits
       ld_y/2 complex elements later. Those helpers are defined elsewhere;
       by name they presumably zero the imaginary and the real component
       respectively -- confirm against their definitions.
     - otherwise (treated as 1r): applies bli_sset0s to the float located
       ld_y real elements past the diagonal entry, i.e. the slot the other
       1r macros use for the imaginary value. The real value is untouched.

   schema      - pack schema selecting the branch as described above.
   m, n        - dimensions; only min( m, n ) is used.
   y           - base address of the destination; expected to be scomplex*.
   rs_y, cs_y  - row / column strides of y in units of complex elements.
                 The 1r branch doubles whichever stride is not 1; it assumes
                 one of the two strides is 1.
   ld_y        - distance between the two stored parts (see above).

   Every macro parameter is parenthesized where it is used so that
   expression arguments expand with the intended precedence. Parameters are
   still evaluated more than once, so arguments must not have side effects.
*/
#define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t min_m_n = bli_min( m, n ); \
	dim_t i; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		scomplex* restrict y_off_ri = (y); \
		scomplex* restrict y_off_ir = (y) + (ld_y)/2; \
\
		/* Step along the diagonal: one row and one column per iteration. */ \
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_cseti0s( *(y_off_ri + i*(rs_y) + i*(cs_y)) ); \
			bli_csetr0s( *(y_off_ir + i*(rs_y) + i*(cs_y)) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = (rs_y); \
		inc_t cs_y2 = (cs_y); \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps \
		   in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		/* Only the imaginary slot (ld_y floats past the real slot) is \
		   cleared in this branch. */ \
		float* restrict y_cast  = ( float* )(y); \
		float* restrict y_off_i = y_cast + (ld_y); \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \
		} \
	} \
}
/*
   bli_zseti01ms_mxn_diag

   Visit the first min( m, n ) diagonal elements of the packed dcomplex
   matrix y (starting at its base address) and clear part of each one:

     - 1e branch (bli_is_1e_packed( schema ) true): applies bli_zseti0s to
       the "ri" element and bli_zsetr0s to the "ir" element, which sits
       ld_y/2 complex elements later. Those helpers are defined elsewhere;
       by name they presumably zero the imaginary and the real component
       respectively -- confirm against their definitions.
     - otherwise (treated as 1r): applies bli_dset0s to the double located
       ld_y real elements past the diagonal entry, i.e. the slot the other
       1r macros use for the imaginary value. The real value is untouched.

   schema      - pack schema selecting the branch as described above.
   m, n        - dimensions; only min( m, n ) is used.
   y           - base address of the destination; expected to be dcomplex*.
   rs_y, cs_y  - row / column strides of y in units of complex elements.
                 The 1r branch doubles whichever stride is not 1; it assumes
                 one of the two strides is 1.
   ld_y        - distance between the two stored parts (see above).

   Every macro parameter is parenthesized where it is used so that
   expression arguments expand with the intended precedence. Parameters are
   still evaluated more than once, so arguments must not have side effects.
*/
#define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t min_m_n = bli_min( m, n ); \
	dim_t i; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		dcomplex* restrict y_off_ri = (y); \
		dcomplex* restrict y_off_ir = (y) + (ld_y)/2; \
\
		/* Step along the diagonal: one row and one column per iteration. */ \
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_zseti0s( *(y_off_ri + i*(rs_y) + i*(cs_y)) ); \
			bli_zsetr0s( *(y_off_ir + i*(rs_y) + i*(cs_y)) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = (rs_y); \
		inc_t cs_y2 = (cs_y); \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps \
		   in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		/* Only the imaginary slot (ld_y doubles past the real slot) is \
		   cleared in this branch. */ \
		double* restrict y_cast  = ( double* )(y); \
		double* restrict y_off_i = y_cast + (ld_y); \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \
		} \
	} \
}
#endif

View File

@@ -0,0 +1,51 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_COPY1RS_H
#define BLIS_COPY1RS_H
// copy1rs
/*
   bli_ccopy1rs( a, br, bi )

   Split the scomplex value a into its real and imaginary components with
   bli_creal / bli_cimag and forward them, together with the destinations
   br and bi, to bli_ccopyris. That helper is defined elsewhere; presumably
   it stores the real component in br and the imaginary component in bi --
   confirm against its definition.

   a is expanded twice, so it must not have side effects.
*/
#define bli_ccopy1rs( a, br, bi ) \
{ \
	bli_ccopyris( bli_creal( a ), \
	              bli_cimag( a ), \
	              br, \
	              bi ); \
}
/*
   bli_zcopy1rs( a, br, bi )

   Split the dcomplex value a into its real and imaginary components with
   bli_zreal / bli_zimag and forward them, together with the destinations
   br and bi, to bli_zcopyris. That helper is defined elsewhere; presumably
   it stores the real component in br and the imaginary component in bi --
   confirm against its definition.

   a is expanded twice, so it must not have side effects.
*/
#define bli_zcopy1rs( a, br, bi ) \
{ \
	bli_zcopyris( bli_zreal( a ), \
	              bli_zimag( a ), \
	              br, \
	              bi ); \
}
#endif

Some files were not shown because too many files have changed in this diff Show More