mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
frame/3/gemm/bli_gemm_front.c
Change-Id: I52a0fbc1d33bb948d430942323bbc5fe44e3ca13
This commit is contained in:
2
Makefile
2
Makefile
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
|
||||
30
README.md
30
README.md
@@ -260,7 +260,9 @@ We also have a third paper, submitted to IPDPS 2014, on achieving
|
||||
```
|
||||
|
||||
A fourth paper, submitted to ACM TOMS, also exists, which proposes an
|
||||
[analytical model](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf) for determining blocksize parameters in BLIS:
|
||||
[analytical model](http://dl.acm.org/citation.cfm?id=2925987)
|
||||
([unofficial backup link](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf))
|
||||
for determining blocksize parameters in BLIS:
|
||||
|
||||
```
|
||||
@article{BLIS4,
|
||||
@@ -278,6 +280,32 @@ A fourth paper, submitted to ACM TOMS, also exists, which proposes an
|
||||
}
|
||||
```
|
||||
|
||||
A fifth paper, submitted to ACM TOMS, begins the study of so-called
|
||||
[induced methods for complex matrix multiplication](http://www.cs.utexas.edu/users/flame/pubs/blis5_toms_rev2.pdf):
|
||||
|
||||
```
|
||||
@article{BLIS5,
|
||||
author = {Field G. {V}an~{Z}ee and Tyler Smith},
|
||||
title = {Implementing high-performance complex matrix multiplication via the 3m and 4m methods},
|
||||
journal = {ACM Transactions on Mathematical Software},
|
||||
year = {2017},
|
||||
note = {accepted}
|
||||
}
|
||||
```
|
||||
|
||||
A sixth paper, submitted to ACM TOMS, revisits the topic of the previous
|
||||
article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev0.pdf):
|
||||
|
||||
```
|
||||
@article{BLIS6,
|
||||
author = {Field G. {V}an~{Z}ee},
|
||||
title = {Implementing high-performance complex matrix multiplication via the 1m method},
|
||||
journal = {ACM Transactions on Mathematical Software},
|
||||
note = {submitted}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
Funding
|
||||
-------
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/bin/sh
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -41,7 +41,11 @@
|
||||
|
||||
main()
|
||||
{
|
||||
CC=gcc
|
||||
if [ clang -v > /dev/null 2>&1 ]; then
|
||||
CC=clang
|
||||
else
|
||||
CC=gcc
|
||||
fi
|
||||
CPUID_SRC=cpuid_x86.c
|
||||
CPUID_BIN=blis_cpu_detect
|
||||
ARCH=reference
|
||||
@@ -59,12 +63,6 @@ main()
|
||||
# of the distribution and the directory in which we are building.
|
||||
cur_dirpath="."
|
||||
|
||||
|
||||
OSNAME=`uname`
|
||||
if [ $OSNAME = "Darwin" ]; then
|
||||
CC=clang
|
||||
fi
|
||||
|
||||
#
|
||||
# Detect architecture by predefined macros
|
||||
#
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/bin/sh
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/bin/sh
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -47,4 +47,4 @@ if [ $? -eq 0 ]; then
|
||||
else
|
||||
echo "Test Pass"
|
||||
exit 0
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/bin/sh
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -374,10 +374,6 @@ gen_mkfiles()
|
||||
|
||||
read_mkfile_config()
|
||||
{
|
||||
local index lname
|
||||
declare -i count
|
||||
|
||||
|
||||
# Read the file describing file suffixes.
|
||||
src_file_suffixes=$(cat "${suffix_file}")
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/bin/sh
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/bin/sh
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/bin/sh
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -77,7 +77,9 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS := -lm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -77,7 +77,9 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS := -lm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -47,8 +47,12 @@ ifeq ($(CC),)
|
||||
CC := gcc
|
||||
CC_VENDOR := gcc
|
||||
endif
|
||||
ifneq ($(CC_VENDOR),gcc)
|
||||
$(error gcc is required for this configuration.)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
else
|
||||
$(error gcc or clang are required for this configuration.)
|
||||
endif
|
||||
endif
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
@@ -77,7 +81,9 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS := -lm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -47,8 +47,12 @@ ifeq ($(CC),)
|
||||
CC := gcc
|
||||
CC_VENDOR := gcc
|
||||
endif
|
||||
ifneq ($(CC_VENDOR),gcc)
|
||||
$(error gcc is required for this configuration.)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
else
|
||||
$(error gcc or clang are required for this configuration.)
|
||||
endif
|
||||
endif
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
@@ -77,7 +81,9 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS := -lm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -77,7 +77,9 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS := -lm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -77,7 +77,9 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS := -lm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -88,7 +88,9 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS := -lm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
|
||||
@@ -52,17 +52,6 @@
|
||||
|
||||
// -- sgemm micro-kernel --
|
||||
|
||||
#if 1
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16
|
||||
#define BLIS_DEFAULT_MC_S 144
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 4080
|
||||
#define BLIS_DEFAULT_MR_S 6
|
||||
#define BLIS_DEFAULT_NR_S 16
|
||||
|
||||
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24
|
||||
#define BLIS_DEFAULT_MC_S 256
|
||||
@@ -74,6 +63,17 @@
|
||||
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16
|
||||
#define BLIS_DEFAULT_MC_S 144
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 4080
|
||||
#define BLIS_DEFAULT_MR_S 6
|
||||
#define BLIS_DEFAULT_NR_S 16
|
||||
|
||||
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6
|
||||
#define BLIS_DEFAULT_MC_S 144
|
||||
@@ -85,17 +85,6 @@
|
||||
|
||||
// -- dgemm micro-kernel --
|
||||
|
||||
#if 1
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8
|
||||
#define BLIS_DEFAULT_MC_D 72
|
||||
#define BLIS_DEFAULT_KC_D 256
|
||||
#define BLIS_DEFAULT_NC_D 4080
|
||||
#define BLIS_DEFAULT_MR_D 6
|
||||
#define BLIS_DEFAULT_NR_D 8
|
||||
|
||||
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12
|
||||
#define BLIS_DEFAULT_MC_D 152
|
||||
@@ -107,6 +96,17 @@
|
||||
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8
|
||||
#define BLIS_DEFAULT_MC_D 72
|
||||
#define BLIS_DEFAULT_KC_D 256
|
||||
#define BLIS_DEFAULT_NC_D 4080
|
||||
#define BLIS_DEFAULT_MR_D 6
|
||||
#define BLIS_DEFAULT_NR_D 8
|
||||
|
||||
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6
|
||||
#define BLIS_DEFAULT_MC_D 72
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -88,7 +88,9 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS := -lm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -43,11 +43,22 @@
|
||||
#define BLIS_SIMD_SIZE 64
|
||||
#define BLIS_SIMD_NUM_REGISTERS 32
|
||||
|
||||
#ifdef BLIS_NO_HBWMALLOC
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#define BLIS_MALLOC_POOL malloc
|
||||
#define BLIS_FREE_POOL free
|
||||
|
||||
#else
|
||||
|
||||
#include <hbwmalloc.h>
|
||||
|
||||
#define BLIS_MALLOC_POOL hbw_malloc
|
||||
#define BLIS_FREE_POOL hbw_free
|
||||
|
||||
#endif
|
||||
|
||||
//#define BLIS_MALLOC_INTL hbw_malloc
|
||||
//#define BLIS_FREE_INTL hbw_free
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -65,6 +65,10 @@ else
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
ifeq ($(DEBUG_TYPE),sde)
|
||||
CPPROCFLAGS += -DBLIS_NO_HBWMALLOC
|
||||
endif
|
||||
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
@@ -95,7 +99,16 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -lm -lmemkind
|
||||
|
||||
ifneq ($(DEBUG_TYPE),sde)
|
||||
LDFLAGS := -lmemkind
|
||||
else
|
||||
LDFLAGS :=
|
||||
endif
|
||||
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS += -lm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -77,7 +77,9 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS := -lm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -77,7 +77,11 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
ifeq ($(CC_VENDOR),icc)
|
||||
LDFLAGS := -mmic
|
||||
else
|
||||
LDFLAGS := -mmic -lm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -47,8 +47,12 @@ ifeq ($(CC),)
|
||||
CC := gcc
|
||||
CC_VENDOR := gcc
|
||||
endif
|
||||
ifneq ($(CC_VENDOR),gcc)
|
||||
$(error gcc is required for this configuration.)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
else
|
||||
$(error gcc or clang are required for this configuration.)
|
||||
endif
|
||||
endif
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
@@ -77,7 +81,9 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS := -lm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -63,7 +63,9 @@ ARFLAGS := rcs
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS :=
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS := -lm
|
||||
endif
|
||||
|
||||
# --- Determine the finalizer and related flags ---
|
||||
FINALIZER := pnacl-finalize
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -77,7 +77,9 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS := -lm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -86,7 +86,9 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS := -lm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -88,7 +88,9 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS := -lm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -77,7 +77,9 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS := -lm
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
5
configure
vendored
5
configure
vendored
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
@@ -458,6 +458,9 @@ main()
|
||||
if [ -n "${debug_flag}" ]; then
|
||||
if [ "x${debug_type}" = "xopt" ]; then
|
||||
echo "${script_name}: enabling debug symbols with optimizations."
|
||||
elif [ "x${debug_type}" = "xsde" ]; then
|
||||
debug_type='sde'
|
||||
echo "${script_name}: enabling SDE processor emulation."
|
||||
else
|
||||
debug_type='noopt'
|
||||
echo "${script_name}: enabling debug symbols; optimizations disabled."
|
||||
|
||||
@@ -41,7 +41,7 @@
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname, kertype ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
\
|
||||
@@ -68,15 +68,15 @@ GENFRONT( swapv, BLIS_SWAPV_KER )
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname, kertype, dep1, dep2, dep3, dep4 ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(dep1,_cntx_init)( cntx ); \
|
||||
PASTEMAC(dep2,_cntx_init)( cntx ); \
|
||||
PASTEMAC(dep3,_cntx_init)( cntx ); \
|
||||
PASTEMAC(dep4,_cntx_init)( cntx ); \
|
||||
PASTEMAC(dep1,_cntx_init)( dt, cntx ); \
|
||||
PASTEMAC(dep2,_cntx_init)( dt, cntx ); \
|
||||
PASTEMAC(dep3,_cntx_init)( dt, cntx ); \
|
||||
PASTEMAC(dep4,_cntx_init)( dt, cntx ); \
|
||||
\
|
||||
/* Initialize the context with the kernel associated with the current
|
||||
operation. */ \
|
||||
@@ -93,12 +93,12 @@ GENFRONT( axpbyv, BLIS_AXPBYV_KER, axpyv, xpbyv, scal2v, scalv )
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname, kertype, depname ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(depname,_cntx_init)( cntx ); \
|
||||
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
|
||||
\
|
||||
/* Initialize the context with the kernel associated with the current
|
||||
operation. */ \
|
||||
@@ -116,13 +116,13 @@ GENFRONT( scalv, BLIS_SCALV_KER, setv )
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname, kertype, dep1, dep2 ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(dep1,_cntx_init)( cntx ); \
|
||||
PASTEMAC(dep2,_cntx_init)( cntx ); \
|
||||
PASTEMAC(dep1,_cntx_init)( dt, cntx ); \
|
||||
PASTEMAC(dep2,_cntx_init)( dt, cntx ); \
|
||||
\
|
||||
/* Initialize the context with the kernel associated with the current
|
||||
operation. */ \
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx );
|
||||
|
||||
GENPROT( addv )
|
||||
|
||||
@@ -53,7 +53,7 @@ void PASTEMAC(ch,opname) \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
|
||||
\
|
||||
@@ -88,7 +88,7 @@ void PASTEMAC(ch,opname) \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
|
||||
\
|
||||
@@ -123,7 +123,7 @@ void PASTEMAC(ch,opname) \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
|
||||
\
|
||||
@@ -160,7 +160,7 @@ void PASTEMAC(ch,opname) \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
|
||||
\
|
||||
@@ -198,7 +198,7 @@ void PASTEMAC(ch,opname) \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
|
||||
\
|
||||
@@ -238,7 +238,7 @@ void PASTEMAC(ch,opname) \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
|
||||
\
|
||||
@@ -274,7 +274,7 @@ void PASTEMAC(ch,opname) \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
|
||||
\
|
||||
@@ -306,7 +306,7 @@ void PASTEMAC(ch,opname) \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
|
||||
\
|
||||
@@ -340,7 +340,7 @@ void PASTEMAC(ch,opname) \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
|
||||
\
|
||||
@@ -373,7 +373,7 @@ void PASTEMAC(ch,opname) \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
|
||||
\
|
||||
|
||||
@@ -41,12 +41,12 @@
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname, depname ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(depname,_cntx_init)( cntx ); \
|
||||
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
|
||||
} \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx );
|
||||
|
||||
GENPROT( addd )
|
||||
|
||||
@@ -90,7 +90,7 @@ void PASTEMAC(ch,opname) \
|
||||
} \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* Query the context for the operation's kernel address. */ \
|
||||
PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
|
||||
@@ -166,7 +166,7 @@ void PASTEMAC(ch,opname) \
|
||||
} \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* Query the context for the operation's kernel address. */ \
|
||||
PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
|
||||
@@ -222,7 +222,7 @@ void PASTEMAC(ch,opname) \
|
||||
x1 = x + offx; \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* Query the context for the operation's kernel address. */ \
|
||||
PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
|
||||
@@ -276,7 +276,7 @@ void PASTEMAC(ch,opname) \
|
||||
x1 = x + offx; \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* Query the context for the operation's kernel address. */ \
|
||||
PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \
|
||||
@@ -349,7 +349,7 @@ void PASTEMAC(ch,opname) \
|
||||
incx = 2*incx; \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* Query the context for the operation's kernel address. */ \
|
||||
PASTECH2(chr,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt_r, kerid, cntx_p ); \
|
||||
|
||||
@@ -41,12 +41,12 @@
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname, kertype, depname ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(depname,_cntx_init)( cntx ); \
|
||||
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
|
||||
\
|
||||
/* Initialize the context with the kernel associated with the current
|
||||
operation. */ \
|
||||
@@ -63,13 +63,13 @@ GENFRONT( axpy2v, BLIS_AXPY2V_KER, axpyv )
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname, kertype, depname1, depname2 ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(depname1,_cntx_init)( cntx ); \
|
||||
PASTEMAC(depname2,_cntx_init)( cntx ); \
|
||||
PASTEMAC(depname1,_cntx_init)( dt, cntx ); \
|
||||
PASTEMAC(depname2,_cntx_init)( dt, cntx ); \
|
||||
\
|
||||
/* Initialize the context with the kernel associated with the current
|
||||
operation. */ \
|
||||
@@ -86,12 +86,12 @@ GENFRONT( dotaxpyv, BLIS_DOTAXPYV_KER, dotxv, axpyv )
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname, kertype, depname ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(depname,_cntx_init)( cntx ); \
|
||||
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
|
||||
\
|
||||
/* Initialize the context with the kernel associated with the current
|
||||
operation. */ \
|
||||
@@ -114,13 +114,13 @@ GENFRONT( axpyf, BLIS_AXPYF_KER, axpyv )
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname, kertype, depname1, depname2 ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(depname1,_cntx_init)( cntx ); \
|
||||
PASTEMAC(depname2,_cntx_init)( cntx ); \
|
||||
PASTEMAC(depname1,_cntx_init)( dt, cntx ); \
|
||||
PASTEMAC(depname2,_cntx_init)( dt, cntx ); \
|
||||
\
|
||||
/* Initialize the context with the kernel associated with the current
|
||||
operation. */ \
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx );
|
||||
|
||||
GENPROT( axpy2v )
|
||||
|
||||
@@ -57,7 +57,7 @@ void PASTEMAC(ch,opname) \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \
|
||||
\
|
||||
@@ -99,7 +99,7 @@ void PASTEMAC(ch,opname) \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \
|
||||
\
|
||||
@@ -142,7 +142,7 @@ void PASTEMAC(ch,opname) \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \
|
||||
\
|
||||
@@ -190,7 +190,7 @@ void PASTEMAC(ch,opname) \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \
|
||||
\
|
||||
@@ -238,7 +238,7 @@ void PASTEMAC(ch,opname) \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \
|
||||
\
|
||||
|
||||
@@ -41,12 +41,12 @@
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname, depname ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(depname,_cntx_init)( cntx ); \
|
||||
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
|
||||
} \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
@@ -64,13 +64,13 @@ GENFRONT( subm, subv )
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname, depname1, depname2 ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(depname1,_cntx_init)( cntx ); \
|
||||
PASTEMAC(depname2,_cntx_init)( cntx ); \
|
||||
PASTEMAC(depname1,_cntx_init)( dt, cntx ); \
|
||||
PASTEMAC(depname2,_cntx_init)( dt, cntx ); \
|
||||
} \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx );
|
||||
|
||||
GENPROT( addm )
|
||||
|
||||
@@ -91,6 +91,7 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \
|
||||
);
|
||||
|
||||
INSERT_GENTDEF( packm_cxk_ker )
|
||||
INSERT_GENTDEF( packm_cxk_1er_ker )
|
||||
|
||||
|
||||
// packm_3mis_ker
|
||||
|
||||
@@ -54,12 +54,13 @@ void PASTEMAC(ch,opname) \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
cntx_t* cntx_p; \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
if ( bli_zero_dim2( m, n ) ) return; \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* Invoke the helper variant, which loops over the appropriate kernel
|
||||
to implement the current operation. */ \
|
||||
@@ -118,12 +119,13 @@ void PASTEMAC(ch,opname) \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
cntx_t* cntx_p; \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
if ( bli_zero_dim2( m, n ) ) return; \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* Invoke the helper variant, which loops over the appropriate kernel
|
||||
to implement the current operation. */ \
|
||||
@@ -187,7 +189,8 @@ void PASTEMAC(ch,opname) \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
cntx_t* cntx_p; \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
if ( bli_zero_dim2( m, n ) ) return; \
|
||||
\
|
||||
@@ -195,7 +198,7 @@ void PASTEMAC(ch,opname) \
|
||||
if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* Invoke the helper variant, which loops over the appropriate kernel
|
||||
to implement the current operation. */ \
|
||||
@@ -256,12 +259,13 @@ void PASTEMAC(ch,opname) \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
cntx_t* cntx_p; \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
if ( bli_zero_dim2( m, n ) ) return; \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* If alpha is zero, then we set the output matrix to zero. This
|
||||
seemingly minor optimization is important because it will clear
|
||||
@@ -344,12 +348,13 @@ void PASTEMAC(ch,opname) \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
cntx_t* cntx_p; \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
if ( bli_zero_dim2( m, n ) ) return; \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* Invoke the helper variant, which loops over the appropriate kernel
|
||||
to implement the current operation. */ \
|
||||
|
||||
@@ -48,9 +48,11 @@
|
||||
#include "bli_packm_struc_cxk_4mi.h"
|
||||
#include "bli_packm_struc_cxk_3mis.h"
|
||||
#include "bli_packm_struc_cxk_rih.h"
|
||||
#include "bli_packm_struc_cxk_1er.h"
|
||||
|
||||
#include "bli_packm_cxk.h"
|
||||
#include "bli_packm_cxk_4mi.h"
|
||||
#include "bli_packm_cxk_3mis.h"
|
||||
#include "bli_packm_cxk_rih.h"
|
||||
#include "bli_packm_cxk_1er.h"
|
||||
|
||||
|
||||
@@ -90,6 +90,12 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
|
||||
// 0111 row/col panels: real+imaginary only
|
||||
{ { NULL, bli_cpackm_struc_cxk_rih,
|
||||
NULL, bli_zpackm_struc_cxk_rih, } },
|
||||
// 1000 row/col panels: 1m-expanded (1e)
|
||||
{ { NULL, bli_cpackm_struc_cxk_1er,
|
||||
NULL, bli_zpackm_struc_cxk_1er, } },
|
||||
// 1001 row/col panels: 1m-reordered (1r)
|
||||
{ { NULL, bli_cpackm_struc_cxk_1er,
|
||||
NULL, bli_zpackm_struc_cxk_1er, } },
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
// Define context initialization functions.
|
||||
//
|
||||
|
||||
void bli_packm_cntx_init( cntx_t* cntx )
|
||||
void bli_packm_cntx_init( num_t dt, cntx_t* cntx )
|
||||
{
|
||||
bli_cntx_obj_create( cntx );
|
||||
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx );
|
||||
|
||||
GENPROT( packm )
|
||||
|
||||
489
frame/1m/packm/bli_packm_cxk_1er.c
Normal file
489
frame/1m/packm/bli_packm_cxk_1er.c
Normal file
@@ -0,0 +1,489 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T packm_cxk_1er_ker_vft
|
||||
|
||||
#undef FUNCPTR_ARRAY_LENGTH
|
||||
#define FUNCPTR_ARRAY_LENGTH 32
|
||||
|
||||
static FUNCPTR_T ftypes_e[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] =
|
||||
{
|
||||
/* micro-panel width = 0 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 1 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 2 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_2XK_1E_KERNEL,
|
||||
NULL, BLIS_ZPACKM_2XK_1E_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 3 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 4 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_4XK_1E_KERNEL,
|
||||
NULL, BLIS_ZPACKM_4XK_1E_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 5 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 6 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_6XK_1E_KERNEL,
|
||||
NULL, BLIS_ZPACKM_6XK_1E_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 7 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 8 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_8XK_1E_KERNEL,
|
||||
NULL, BLIS_ZPACKM_8XK_1E_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 9 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 10 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_10XK_1E_KERNEL,
|
||||
NULL, BLIS_ZPACKM_10XK_1E_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 11 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 12 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_12XK_1E_KERNEL,
|
||||
NULL, BLIS_ZPACKM_12XK_1E_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 13 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 14 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_14XK_1E_KERNEL,
|
||||
NULL, BLIS_ZPACKM_14XK_1E_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 15 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 16 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_16XK_1E_KERNEL,
|
||||
NULL, BLIS_ZPACKM_16XK_1E_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 17 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 18 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 19 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 20 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 21 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 22 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 23 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 24 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 25 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 26 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 27 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 28 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 29 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 30 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_30XK_1E_KERNEL,
|
||||
NULL, BLIS_ZPACKM_30XK_1E_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 31 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
};
|
||||
|
||||
static FUNCPTR_T ftypes_r[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] =
|
||||
{
|
||||
/* micro-panel width = 0 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 1 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 2 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_2XK_1R_KERNEL,
|
||||
NULL, BLIS_ZPACKM_2XK_1R_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 3 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_3XK_1R_KERNEL,
|
||||
NULL, BLIS_ZPACKM_3XK_1R_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 4 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_4XK_1R_KERNEL,
|
||||
NULL, BLIS_ZPACKM_4XK_1R_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 5 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 6 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_6XK_1R_KERNEL,
|
||||
NULL, BLIS_ZPACKM_6XK_1R_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 7 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 8 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_8XK_1R_KERNEL,
|
||||
NULL, BLIS_ZPACKM_8XK_1R_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 9 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 10 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_10XK_1R_KERNEL,
|
||||
NULL, BLIS_ZPACKM_10XK_1R_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 11 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 12 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_12XK_1R_KERNEL,
|
||||
NULL, BLIS_ZPACKM_12XK_1R_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 13 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 14 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_14XK_1R_KERNEL,
|
||||
NULL, BLIS_ZPACKM_14XK_1R_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 15 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 16 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_16XK_1R_KERNEL,
|
||||
NULL, BLIS_ZPACKM_16XK_1R_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 17 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 18 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 19 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 20 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 21 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 22 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 23 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 24 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 25 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 26 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 27 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 28 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 29 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
/* micro-panel width = 30 */
|
||||
{
|
||||
NULL, BLIS_CPACKM_30XK_1R_KERNEL,
|
||||
NULL, BLIS_ZPACKM_30XK_1R_KERNEL,
|
||||
},
|
||||
/* micro-panel width = 31 */
|
||||
{
|
||||
NULL, NULL, NULL, NULL,
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t ldp, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
num_t dt; \
|
||||
FUNCPTR_T f; \
|
||||
\
|
||||
/* Acquire the datatype for the current function. */ \
|
||||
dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Index into the array to extract the correct function pointer.
|
||||
If the micro-panel dimension is too big to be within the array of
|
||||
explicitly handled kernels, then we treat that kernel the same
|
||||
as if it were in range but unimplemented. */ \
|
||||
if ( panel_dim < FUNCPTR_ARRAY_LENGTH ) \
|
||||
{ \
|
||||
if ( bli_is_1e_packed( schema ) ) f = ftypes_e[panel_dim][dt]; \
|
||||
else /*( bli_is_1r_packed( schema ) )*/ f = ftypes_r[panel_dim][dt]; \
|
||||
} \
|
||||
else f = NULL; \
|
||||
\
|
||||
/* If there exists a kernel implementation for the micro-panel dimension
|
||||
provided, we invoke the implementation. Otherwise, we use scal2m. */ \
|
||||
if ( f != NULL ) \
|
||||
{ \
|
||||
f \
|
||||
( \
|
||||
conja, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
a, inca, lda, \
|
||||
p, ldp \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
dim_t i, j; \
|
||||
\
|
||||
if ( bli_is_1e_packed( schema ) ) \
|
||||
{ \
|
||||
\
|
||||
ctype* restrict kappa_cast = ( ctype* )kappa; \
|
||||
ctype* restrict a_ri = ( ctype* )a; \
|
||||
ctype* restrict p_ri = ( ctype* )p; \
|
||||
ctype* restrict p_ir = ( ctype* )p + ldp/2; \
|
||||
\
|
||||
/* Treat the micro-panel as panel_dim x panel_len and column-stored
|
||||
(unit row stride). */ \
|
||||
\
|
||||
/* NOTE: The loops below are inlined versions of scal2m, but
|
||||
for separated real/imaginary storage. */ \
|
||||
\
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( j = 0; j < panel_len; ++j ) \
|
||||
{ \
|
||||
for ( i = 0; i < panel_dim; ++i ) \
|
||||
{ \
|
||||
ctype* restrict alpha11_ri = a_ri + (i )*inca + (j )*lda; \
|
||||
ctype* restrict pi11_ri = p_ri + (i )*1 + (j )*ldp; \
|
||||
ctype* restrict pi11_ir = p_ir + (i )*1 + (j )*ldp; \
|
||||
\
|
||||
PASTEMAC(ch,scal2j1es)( *kappa_cast, \
|
||||
*alpha11_ri, \
|
||||
*pi11_ri, \
|
||||
*pi11_ir ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_noconj( conja ) ) */ \
|
||||
{ \
|
||||
for ( j = 0; j < panel_len; ++j ) \
|
||||
{ \
|
||||
for ( i = 0; i < panel_dim; ++i ) \
|
||||
{ \
|
||||
ctype* restrict alpha11_ri = a_ri + (i )*inca + (j )*lda; \
|
||||
ctype* restrict pi11_ri = p_ri + (i )*1 + (j )*ldp; \
|
||||
ctype* restrict pi11_ir = p_ir + (i )*1 + (j )*ldp; \
|
||||
\
|
||||
PASTEMAC(ch,scal21es)( *kappa_cast, \
|
||||
*alpha11_ri, \
|
||||
*pi11_ri, \
|
||||
*pi11_ir ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_1r_packed( schema ) ) */ \
|
||||
{ \
|
||||
ctype_r* restrict kappa_r = ( ctype_r* )kappa; \
|
||||
ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \
|
||||
ctype_r* restrict a_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict a_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict p_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict p_i = ( ctype_r* )p + ldp; \
|
||||
const dim_t inca2 = 2*inca; \
|
||||
const dim_t lda2 = 2*lda; \
|
||||
const dim_t ldp2 = 2*ldp; \
|
||||
\
|
||||
/* Treat the micro-panel as panel_dim x panel_len and column-stored
|
||||
(unit row stride). */ \
|
||||
\
|
||||
/* NOTE: The loops below are inlined versions of scal2m, but
|
||||
for separated real/imaginary storage. */ \
|
||||
\
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( j = 0; j < panel_len; ++j ) \
|
||||
{ \
|
||||
for ( i = 0; i < panel_dim; ++i ) \
|
||||
{ \
|
||||
ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \
|
||||
ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \
|
||||
ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp2; \
|
||||
ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp2; \
|
||||
\
|
||||
PASTEMAC(ch,scal2jris)( *kappa_r, \
|
||||
*kappa_i, \
|
||||
*alpha11_r, \
|
||||
*alpha11_i, \
|
||||
*pi11_r, \
|
||||
*pi11_i ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_noconj( conja ) ) */ \
|
||||
{ \
|
||||
for ( j = 0; j < panel_len; ++j ) \
|
||||
{ \
|
||||
for ( i = 0; i < panel_dim; ++i ) \
|
||||
{ \
|
||||
ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \
|
||||
ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \
|
||||
ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp2; \
|
||||
ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp2; \
|
||||
\
|
||||
PASTEMAC(ch,scal2ris)( *kappa_r, \
|
||||
*kappa_i, \
|
||||
*alpha11_r, \
|
||||
*alpha11_i, \
|
||||
*pi11_r, \
|
||||
*pi11_i ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_cxk_1er )
|
||||
|
||||
55
frame/1m/packm/bli_packm_cxk_1er.h
Normal file
55
frame/1m/packm/bli_packm_cxk_1er.h
Normal file
@@ -0,0 +1,55 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "bli_packm_cxk_1e_ref.h"
|
||||
#include "bli_packm_cxk_1r_ref.h"
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t ldp, \
|
||||
cntx_t* cntx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_cxk_1er )
|
||||
|
||||
@@ -121,11 +121,11 @@ siz_t bli_packm_init
|
||||
|
||||
if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK )
|
||||
{
|
||||
schema = bli_cntx_get_pack_schema_a( cntx );
|
||||
schema = bli_cntx_get_pack_schema_a_block( cntx );
|
||||
}
|
||||
else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL )
|
||||
{
|
||||
schema = bli_cntx_get_pack_schema_b( cntx );
|
||||
schema = bli_cntx_get_pack_schema_b_panel( cntx );
|
||||
}
|
||||
else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL )
|
||||
{
|
||||
|
||||
610
frame/1m/packm/bli_packm_struc_cxk_1er.c
Normal file
610
frame/1m/packm/bli_packm_struc_cxk_1er.c
Normal file
@@ -0,0 +1,610 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_len; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t ldp; \
|
||||
\
|
||||
\
|
||||
/* Determine the dimensions and relative strides of the micro-panel
|
||||
based on its pack schema. */ \
|
||||
if ( bli_is_col_packed( schema ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_row_packed( schema ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* Handle micro-panel packing based on the structure of the matrix
|
||||
being packed. */ \
|
||||
if ( bli_is_general( strucc ) ) \
|
||||
{ \
|
||||
/* For micro-panels of general matrices, we can call the pack
|
||||
kernel front-end directly. */ \
|
||||
PASTEMAC(ch,kername) \
|
||||
( \
|
||||
conjc, \
|
||||
schema, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, ldp, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
/* Call a helper function for micro-panels of Hermitian/symmetric
|
||||
matrices. */ \
|
||||
PASTEMAC(ch,packm_herm_cxk_1er) \
|
||||
( \
|
||||
strucc, \
|
||||
diagoffc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
schema, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
m_panel_max, \
|
||||
n_panel_max, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, rs_c, cs_c, \
|
||||
incc, ldc, \
|
||||
p, rs_p, cs_p, \
|
||||
ldp, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else /* ( bli_is_triangular( strucc ) ) */ \
|
||||
{ \
|
||||
/* Call a helper function for micro-panels of triangular
|
||||
matrices. */ \
|
||||
PASTEMAC(ch,packm_tri_cxk_1er) \
|
||||
( \
|
||||
strucc, \
|
||||
diagoffc, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
schema, \
|
||||
invdiag, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
m_panel_max, \
|
||||
n_panel_max, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, rs_c, cs_c, \
|
||||
incc, ldc, \
|
||||
p, rs_p, cs_p, \
|
||||
ldp, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimensions are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This allows the
|
||||
micro-kernel to remain simple since it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
{ \
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
dim_t offm = m_panel; \
|
||||
dim_t offn = 0; \
|
||||
dim_t m_edge = m_panel_max - m_panel; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
\
|
||||
PASTEMAC(ch,set1ms_mxn) \
|
||||
( \
|
||||
schema, \
|
||||
offm, \
|
||||
offn, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p, rs_p, cs_p, ldp \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
dim_t offm = 0; \
|
||||
dim_t offn = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - n_panel; \
|
||||
\
|
||||
PASTEMAC(ch,set1ms_mxn) \
|
||||
( \
|
||||
schema, \
|
||||
offm, \
|
||||
offn, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p, rs_p, cs_p, ldp \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
if ( bli_is_triangular( strucc ) ) \
|
||||
{ \
|
||||
/* If this micro-panel is an edge case in both panel dimension and
|
||||
length, then it must be a bottom-right corner case, which
|
||||
typically only happens for micro-panels being packed for trsm.
|
||||
(It also happens for trmm if kr > 1.) Here, we set the part of
|
||||
the diagonal that extends into the zero-padded region to
|
||||
identity. This prevents NaNs and Infs from creeping into the
|
||||
computation. If this code does execute for trmm, it is okay,
|
||||
because those 1.0's that extend into the bottom-right region
|
||||
end up getting multiplied by the 0.0's in the zero-padded region
|
||||
of the other matrix. */ \
|
||||
if ( m_panel != m_panel_max && \
|
||||
n_panel != n_panel_max ) \
|
||||
{ \
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
dim_t offm = m_panel; \
|
||||
dim_t offn = n_panel; \
|
||||
dim_t m_edge = m_panel_max - m_panel; \
|
||||
dim_t n_edge = n_panel_max - n_panel; \
|
||||
\
|
||||
PASTEMAC(ch,set1ms_mxn_diag) \
|
||||
( \
|
||||
schema, \
|
||||
offm, \
|
||||
offn, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
one, \
|
||||
p, rs_p, cs_p, ldp \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/*
|
||||
if ( bli_is_1r_packed( schema ) ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1r): bp", m_panel_max, 2*n_panel_max, \
|
||||
( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
\
|
||||
if ( bli_is_1e_packed( schema ) ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1e): ap", 2*m_panel_max, 2*n_panel_max, \
|
||||
( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er )
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t ldp, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
doff_t diagoffc_abs; \
|
||||
dim_t j; \
|
||||
bool_t row_stored; \
|
||||
bool_t col_stored; \
|
||||
\
|
||||
\
|
||||
/* Create flags to incidate row or column storage. Note that the
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
row_stored = bli_is_col_packed( schema ); \
|
||||
col_stored = bli_is_row_packed( schema ); \
|
||||
\
|
||||
/* Handle the case where the micro-panel does NOT intersect the
|
||||
diagonal separately from the case where it does intersect. */ \
|
||||
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
|
||||
{ \
|
||||
/* If the current panel is unstored, we need to make a few
|
||||
adjustments so we refer to the data where it is actually
|
||||
stored, also taking conjugation into account. (Note this
|
||||
implicitly assumes we are operating on a dense panel
|
||||
within a larger symmetric or Hermitian matrix, since a
|
||||
general matrix would not contain any unstored region.) */ \
|
||||
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
|
||||
{ \
|
||||
c = c + diagoffc * ( doff_t )cs_c + \
|
||||
-diagoffc * ( doff_t )rs_c; \
|
||||
bli_swap_incs( incc, ldc ); \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc ); \
|
||||
} \
|
||||
\
|
||||
/* Pack the full panel. */ \
|
||||
PASTEMAC(ch,kername) \
|
||||
( \
|
||||
conjc, \
|
||||
schema, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, ldp, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
|
||||
{ \
|
||||
ctype* restrict c10; \
|
||||
ctype* restrict p10; \
|
||||
dim_t p10_dim, p10_len; \
|
||||
inc_t incc10, ldc10; \
|
||||
doff_t diagoffc10; \
|
||||
conj_t conjc10; \
|
||||
\
|
||||
ctype* restrict c12; \
|
||||
ctype* restrict p12; \
|
||||
dim_t p12_dim, p12_len; \
|
||||
inc_t incc12, ldc12; \
|
||||
doff_t diagoffc12; \
|
||||
conj_t conjc12; \
|
||||
\
|
||||
\
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( col_stored && diagoffc < 0 ) || \
|
||||
( row_stored && diagoffc > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
diagoffc_abs = bli_abs( diagoffc ); \
|
||||
\
|
||||
if ( ( row_stored && bli_is_upper( uploc ) ) || \
|
||||
( col_stored && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs; \
|
||||
p10 = p; \
|
||||
c10 = c; \
|
||||
incc10 = incc; \
|
||||
ldc10 = ldc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
diagoffc12 = diagoffc_abs - j; \
|
||||
p12 = p + (j )*ldp; \
|
||||
c12 = c + (j )*ldc; \
|
||||
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
|
||||
-diagoffc12 * ( doff_t )rs_c; \
|
||||
incc12 = ldc; \
|
||||
ldc12 = incc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc12 ); \
|
||||
} \
|
||||
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
|
||||
( col_stored && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs + panel_dim; \
|
||||
diagoffc10 = diagoffc; \
|
||||
p10 = p; \
|
||||
c10 = c; \
|
||||
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
|
||||
-diagoffc10 * ( doff_t )rs_c; \
|
||||
incc10 = ldc; \
|
||||
ldc10 = incc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
p12 = p + (j )*ldp; \
|
||||
c12 = c + (j )*ldc; \
|
||||
incc12 = incc; \
|
||||
ldc12 = ldc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc10 ); \
|
||||
} \
|
||||
\
|
||||
/* Pack to p10. For upper storage, this includes the unstored
|
||||
triangle of c11. */ \
|
||||
PASTEMAC(ch,kername) \
|
||||
( \
|
||||
conjc10, \
|
||||
schema, \
|
||||
p10_dim, \
|
||||
p10_len, \
|
||||
kappa, \
|
||||
c10, incc10, ldc10, \
|
||||
p10, ldp, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Pack to p12. For lower storage, this includes the unstored
|
||||
triangle of c11. */ \
|
||||
PASTEMAC(ch,kername) \
|
||||
( \
|
||||
conjc12, \
|
||||
schema, \
|
||||
p12_dim, \
|
||||
p12_len, \
|
||||
kappa, \
|
||||
c12, incc12, ldc12, \
|
||||
p12, ldp, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Pack the stored triangle of c11 to p11. */ \
|
||||
{ \
|
||||
dim_t j = diagoffc_abs; \
|
||||
ctype* restrict c11 = c + (j )*ldc; \
|
||||
ctype* restrict p11 = p + (j )*ldp; \
|
||||
\
|
||||
PASTEMAC(ch,scal21ms_mxn_uplo) \
|
||||
( \
|
||||
schema, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
panel_dim, \
|
||||
kappa, \
|
||||
c11, rs_c, cs_c, \
|
||||
p11, rs_p, cs_p, ldp \
|
||||
); \
|
||||
\
|
||||
/* If we are packing a micro-panel with Hermitian structure,
|
||||
we must take special care of the diagonal. Now, if kappa
|
||||
were guaranteed to be unit, all we would need to do is
|
||||
explicitly zero out the imaginary part of the diagonal of
|
||||
p11, in case the diagonal of the source matrix contained
|
||||
garbage (non-zero) imaginary values. HOWEVER, since kappa
|
||||
can be non-unit, things become a little more complicated.
|
||||
In general, we must re-apply the kappa scalar to ONLY the
|
||||
real part of the diagonal of the source matrix and save
|
||||
the result to the diagonal of p11. */ \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
{ \
|
||||
ctype_r* restrict c11_r = ( ctype_r* )c11; \
|
||||
const dim_t rs_c2 = 2*rs_c; \
|
||||
const dim_t cs_c2 = 2*cs_c; \
|
||||
\
|
||||
PASTEMAC3(ch,chr,ch,scal21ms_mxn_diag) \
|
||||
( \
|
||||
schema, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
kappa, \
|
||||
c11_r, rs_c2, cs_c2, \
|
||||
p11, rs_p, cs_p, ldp \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_1er, packm_cxk_1er )
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t ldp, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
doff_t diagoffp_abs = bli_abs( diagoffp ); \
|
||||
ctype* p11 = p + (diagoffp_abs )*ldp; \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,kername) \
|
||||
( \
|
||||
conjc, \
|
||||
schema, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, ldp, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
\
|
||||
/* Tweak the panel according to its triangular structure */ \
|
||||
{ \
|
||||
/* If the diagonal of c is implicitly unit, explicitly set the
|
||||
the diagonal of the packed panel to kappa. */ \
|
||||
if ( bli_is_unit_diag( diagc ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,set1ms_mxn_diag) \
|
||||
( \
|
||||
schema, \
|
||||
0, \
|
||||
0, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
kappa, \
|
||||
p11, rs_p, cs_p, ldp \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* If requested, invert the diagonal of the packed panel. */ \
|
||||
if ( invdiag == TRUE ) \
|
||||
{ \
|
||||
PASTEMAC(ch,invert1ms_mxn_diag) \
|
||||
( \
|
||||
schema, \
|
||||
0, \
|
||||
0, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
p11, rs_p, cs_p, ldp \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* Set the region opposite the diagonal of p to zero. To do this,
|
||||
we need to reference the "unstored" region on the other side of
|
||||
the diagonal. This amounts to toggling uploc and then shifting
|
||||
the diagonal offset to shrink the newly referenced region (by
|
||||
one diagonal). Note that this zero-filling is not needed for
|
||||
trsm, since the unstored region is not referenced by the trsm
|
||||
micro-kernel; however, zero-filling is needed for trmm, which
|
||||
uses the gemm micro-kernel.*/ \
|
||||
{ \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
uplo_t uplop = uploc; \
|
||||
doff_t diagoffp11_0 = 0; \
|
||||
dim_t p11_0_dim = panel_dim - 1; \
|
||||
\
|
||||
bli_toggle_uplo( uplop ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp11_0 ); \
|
||||
\
|
||||
/* Note that this macro works a little differently than the setm
|
||||
operation. Here, we pass in the dimensions of only p11, rather
|
||||
than the whole micro-panel, and furthermore we pass in the
|
||||
"shrunken" dimensions of p11, corresponding to the toggling
|
||||
and shrinking of the diagonal above. The macro will do the
|
||||
right thing, incrementing the pointer to p11 by the appropriate
|
||||
leading dimension (cs_p or rs_p), and setting only the lower
|
||||
or upper triangle to zero. */ \
|
||||
PASTEMAC(ch,set1ms_mxn_uplo) \
|
||||
( \
|
||||
schema, \
|
||||
diagoffp11_0, \
|
||||
uplop, \
|
||||
p11_0_dim, \
|
||||
p11_0_dim, \
|
||||
zero, \
|
||||
p11, rs_p, cs_p, ldp \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_1er, packm_cxk_1er )
|
||||
|
||||
117
frame/1m/packm/bli_packm_struc_cxk_1er.h
Normal file
117
frame/1m/packm/bli_packm_struc_cxk_1er.h
Normal file
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, \
|
||||
cntx_t* cntx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_struc_cxk_1er )
|
||||
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t ldp, \
|
||||
cntx_t* cntx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_herm_cxk_1er )
|
||||
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t ldp, \
|
||||
cntx_t* cntx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_tri_cxk_1er )
|
||||
|
||||
1099
frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.c
Normal file
1099
frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.c
Normal file
File diff suppressed because it is too large
Load Diff
62
frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.h
Normal file
62
frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.h
Normal file
@@ -0,0 +1,62 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Map each generic level-1m packm kernel name onto its 1e reference
// kernel, so that the kernel API template included at the bottom induces
// prototypes for the 1e reference kernels.

#undef  packm_2xk_ker_name
#define packm_2xk_ker_name  packm_2xk_1e_ref

// 1e format should probably never have an odd-numbered register blocking.
//#undef  packm_3xk_ker_name
//#define packm_3xk_ker_name  packm_3xk_1e_ref

#undef  packm_4xk_ker_name
#define packm_4xk_ker_name  packm_4xk_1e_ref

#undef  packm_6xk_ker_name
#define packm_6xk_ker_name  packm_6xk_1e_ref

#undef  packm_8xk_ker_name
#define packm_8xk_ker_name  packm_8xk_1e_ref

#undef  packm_10xk_ker_name
#define packm_10xk_ker_name packm_10xk_1e_ref

#undef  packm_12xk_ker_name
#define packm_12xk_ker_name packm_12xk_1e_ref

#undef  packm_14xk_ker_name
#define packm_14xk_ker_name packm_14xk_1e_ref

#undef  packm_16xk_ker_name
#define packm_16xk_ker_name packm_16xk_1e_ref

#undef  packm_30xk_ker_name
#define packm_30xk_ker_name packm_30xk_1e_ref

// Pull in the level-1m kernel API template, which consumes the names
// defined above.

#include "bli_l1m_ker.h"
|
||||
|
||||
1254
frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.c
Normal file
1254
frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.c
Normal file
File diff suppressed because it is too large
Load Diff
61
frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.h
Normal file
61
frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.h
Normal file
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Map each generic level-1m packm kernel name onto its 1r reference
// kernel, so that the kernel API template included at the bottom induces
// prototypes for the 1r reference kernels.

#undef  packm_2xk_ker_name
#define packm_2xk_ker_name  packm_2xk_1r_ref

#undef  packm_3xk_ker_name
#define packm_3xk_ker_name  packm_3xk_1r_ref

#undef  packm_4xk_ker_name
#define packm_4xk_ker_name  packm_4xk_1r_ref

#undef  packm_6xk_ker_name
#define packm_6xk_ker_name  packm_6xk_1r_ref

#undef  packm_8xk_ker_name
#define packm_8xk_ker_name  packm_8xk_1r_ref

#undef  packm_10xk_ker_name
#define packm_10xk_ker_name packm_10xk_1r_ref

#undef  packm_12xk_ker_name
#define packm_12xk_ker_name packm_12xk_1r_ref

#undef  packm_14xk_ker_name
#define packm_14xk_ker_name packm_14xk_1r_ref

#undef  packm_16xk_ker_name
#define packm_16xk_ker_name packm_16xk_1r_ref

#undef  packm_30xk_ker_name
#define packm_30xk_ker_name packm_30xk_1r_ref

// Pull in the level-1m kernel API template, which consumes the names
// defined above.

#include "bli_l1m_ker.h"
|
||||
|
||||
@@ -41,7 +41,7 @@
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
/* Perform basic setup on the context. */ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
@@ -50,20 +50,20 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
operation. */ \
|
||||
/*bli_gks_cntx_set_l1f_ker( BLIS_AXPYF_KER, cntx );*/ \
|
||||
/*bli_gks_cntx_set_l1f_ker( BLIS_DOTXF_KER, cntx );*/ \
|
||||
bli_axpyf_cntx_init( cntx ); \
|
||||
bli_dotxf_cntx_init( cntx ); \
|
||||
bli_axpyf_cntx_init( dt, cntx ); \
|
||||
bli_dotxf_cntx_init( dt, cntx ); \
|
||||
\
|
||||
/*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \
|
||||
/*bli_gks_cntx_set_l1v_ker( BLIS_DOTXV_KER, cntx );*/ \
|
||||
/*bli_gks_cntx_set_l1v_ker( BLIS_SCALV_KER, cntx );*/ \
|
||||
/*bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx );*/ \
|
||||
bli_axpyv_cntx_init( cntx ); \
|
||||
bli_dotxv_cntx_init( cntx ); \
|
||||
bli_scalv_cntx_init( cntx ); \
|
||||
bli_setv_cntx_init( cntx ); \
|
||||
bli_axpyv_cntx_init( dt, cntx ); \
|
||||
bli_dotxv_cntx_init( dt, cntx ); \
|
||||
bli_scalv_cntx_init( dt, cntx ); \
|
||||
bli_setv_cntx_init( dt, cntx ); \
|
||||
\
|
||||
/* Initialize the context with packm-related kernels. */ \
|
||||
bli_packm_cntx_init( cntx ); \
|
||||
bli_packm_cntx_init( dt, cntx ); \
|
||||
\
|
||||
/* Set the register and cache blocksizes and multiples, as well
|
||||
as the execution method. */ \
|
||||
@@ -88,7 +88,7 @@ GENFRONT( trsv )
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
/* Perform basic setup on the context. */ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
@@ -96,10 +96,10 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
/* Initialize the context with kernels employed by the current
|
||||
operation. */ \
|
||||
/*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \
|
||||
bli_axpyv_cntx_init( cntx ); \
|
||||
bli_axpyv_cntx_init( dt, cntx ); \
|
||||
\
|
||||
/* Initialize the context with packm-related kernels. */ \
|
||||
bli_packm_cntx_init( cntx ); \
|
||||
bli_packm_cntx_init( dt, cntx ); \
|
||||
\
|
||||
/* Set the register and cache blocksizes and multiples, as well
|
||||
as the execution method. */ \
|
||||
@@ -122,7 +122,7 @@ GENFRONT( syr )
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
/* Perform basic setup on the context. */ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
@@ -133,22 +133,22 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
/*bli_gks_cntx_set_l1f_ker( BLIS_AXPYF_KER, cntx );*/ \
|
||||
/*bli_gks_cntx_set_l1f_ker( BLIS_DOTXF_KER, cntx );*/ \
|
||||
/*bli_gks_cntx_set_l1f_ker( BLIS_DOTXAXPYF_KER, cntx );*/ \
|
||||
bli_dotaxpyv_cntx_init( cntx ); \
|
||||
bli_axpyf_cntx_init( cntx ); \
|
||||
bli_dotxf_cntx_init( cntx ); \
|
||||
bli_dotxaxpyf_cntx_init( cntx ); \
|
||||
bli_dotaxpyv_cntx_init( dt, cntx ); \
|
||||
bli_axpyf_cntx_init( dt, cntx ); \
|
||||
bli_dotxf_cntx_init( dt, cntx ); \
|
||||
bli_dotxaxpyf_cntx_init( dt, cntx ); \
|
||||
\
|
||||
/*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \
|
||||
/*bli_gks_cntx_set_l1v_ker( BLIS_DOTXV_KER, cntx );*/ \
|
||||
/*bli_gks_cntx_set_l1v_ker( BLIS_SCALV_KER, cntx );*/ \
|
||||
/*bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx );*/ \
|
||||
bli_axpyv_cntx_init( cntx ); \
|
||||
bli_dotxv_cntx_init( cntx ); \
|
||||
bli_scalv_cntx_init( cntx ); \
|
||||
bli_setv_cntx_init( cntx ); \
|
||||
bli_axpyv_cntx_init( dt, cntx ); \
|
||||
bli_dotxv_cntx_init( dt, cntx ); \
|
||||
bli_scalv_cntx_init( dt, cntx ); \
|
||||
bli_setv_cntx_init( dt, cntx ); \
|
||||
\
|
||||
/* Initialize the context with packm-related kernels. */ \
|
||||
bli_packm_cntx_init( cntx ); \
|
||||
bli_packm_cntx_init( dt, cntx ); \
|
||||
\
|
||||
/* Set the register and cache blocksizes and multiples, as well
|
||||
as the execution method. */ \
|
||||
@@ -173,7 +173,7 @@ GENFRONT( symv )
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
/* Perform basic setup on the context. */ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
@@ -182,11 +182,11 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \
|
||||
operation. */ \
|
||||
/*bli_gks_cntx_set_l1f_ker( BLIS_AXPY2V_KER, cntx );*/ \
|
||||
/*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \
|
||||
bli_axpy2v_cntx_init( cntx ); \
|
||||
bli_axpyv_cntx_init( cntx ); \
|
||||
bli_axpy2v_cntx_init( dt, cntx ); \
|
||||
bli_axpyv_cntx_init( dt, cntx ); \
|
||||
\
|
||||
/* Initialize the context with packm-related kernels. */ \
|
||||
bli_packm_cntx_init( cntx ); \
|
||||
bli_packm_cntx_init( dt, cntx ); \
|
||||
\
|
||||
/* Set the register and cache blocksizes and multiples, as well
|
||||
as the execution method. */ \
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx );
|
||||
|
||||
GENPROT( gemv )
|
||||
|
||||
@@ -55,8 +55,9 @@ void PASTEMAC(ch,opname) \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
cntx_t* cntx_p; \
|
||||
dim_t m_y, n_x; \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
dim_t m_y, n_x; \
|
||||
\
|
||||
/* Determine the dimensions of y and x. */ \
|
||||
bli_set_dims_with_trans( transa, m, n, m_y, n_x ); \
|
||||
@@ -65,7 +66,7 @@ void PASTEMAC(ch,opname) \
|
||||
if ( bli_zero_dim1( m_y ) ) return; \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* If x has zero elements, or if alpha is zero, scale y by beta and
|
||||
return early. */ \
|
||||
@@ -135,13 +136,14 @@ void PASTEMAC(ch,opname) \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
cntx_t* cntx_p; \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
/* If x or y has zero elements, or if alpha is zero, return early. */ \
|
||||
if ( bli_zero_dim2( m, n ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* Declare a void function pointer for the current operation. */ \
|
||||
PASTECH2(ch,ftname,_ft) f; \
|
||||
@@ -188,10 +190,11 @@ void PASTEMAC(ch,opname) \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
cntx_t* cntx_p; \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* If x has zero elements, or if alpha is zero, scale y by beta and
|
||||
return early. */ \
|
||||
@@ -261,8 +264,9 @@ void PASTEMAC(ch,opname) \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
cntx_t* cntx_p; \
|
||||
ctype alpha_local; \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
ctype alpha_local; \
|
||||
\
|
||||
/* If x has zero elements, or if alpha is zero, return early. */ \
|
||||
if ( bli_zero_dim1( m ) || PASTEMAC(chr,eq0)( *alpha ) ) return; \
|
||||
@@ -273,7 +277,7 @@ void PASTEMAC(ch,opname) \
|
||||
PASTEMAC2(chr,ch,copys)( *alpha, alpha_local ); \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* Declare a void function pointer for the current operation. */ \
|
||||
PASTECH2(ch,ftname,_ft) f; \
|
||||
@@ -324,13 +328,14 @@ void PASTEMAC(ch,opname) \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
cntx_t* cntx_p; \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
/* If x has zero elements, or if alpha is zero, return early. */ \
|
||||
if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* Declare a void function pointer for the current operation. */ \
|
||||
PASTECH2(ch,ftname,_ft) f; \
|
||||
@@ -383,13 +388,14 @@ void PASTEMAC(ch,opname) \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
cntx_t* cntx_p; \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
/* If x has zero elements, or if alpha is zero, return early. */ \
|
||||
if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* Declare a void function pointer for the current operation. */ \
|
||||
PASTECH2(ch,ftname,_ft) f; \
|
||||
@@ -444,10 +450,11 @@ void PASTEMAC(ch,opname) \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
cntx_t* cntx_p; \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
cntx_t* cntx_p; \
|
||||
\
|
||||
/* Initialize a local context if the given context is NULL. */ \
|
||||
bli_cntx_init_local_if( opname, cntx, cntx_p ); \
|
||||
bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \
|
||||
\
|
||||
/* If x has zero elements, return early. */ \
|
||||
if ( bli_zero_dim1( m ) ) return; \
|
||||
|
||||
@@ -70,8 +70,8 @@ void bli_l3_cntl_create_if
|
||||
else
|
||||
{
|
||||
// If the user provided a control tree, create a copy and use it
|
||||
// instead (so that it can be used to cache things like pack mem_t
|
||||
// entries).
|
||||
// instead (so that each thread can use its local tree as a place to
|
||||
// cache things like pack mem_t entries).
|
||||
*cntl_use = bli_cntl_copy( cntl_orig );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
// Define context initialization functions.
|
||||
//
|
||||
|
||||
void bli_gemm_cntx_init( cntx_t* cntx )
|
||||
void bli_gemm_cntx_init( num_t dt, cntx_t* cntx )
|
||||
{
|
||||
// Clear the context fields.
|
||||
bli_cntx_obj_clear( cntx );
|
||||
@@ -49,7 +49,7 @@ void bli_gemm_cntx_init( cntx_t* cntx )
|
||||
bli_gks_cntx_set_l3_nat_ukr_prefs( BLIS_GEMM_UKR, cntx );
|
||||
|
||||
// Initialize the context with packm-related kernels.
|
||||
bli_packm_cntx_init( cntx );
|
||||
bli_packm_cntx_init( dt, cntx );
|
||||
|
||||
// Initialize the context with the current architecture's register
|
||||
// and cache blocksizes (and multiples), given the execution method.
|
||||
@@ -63,9 +63,8 @@ void bli_gemm_cntx_init( cntx_t* cntx )
|
||||
cntx );
|
||||
|
||||
// Set the pack_t schemas for native execution.
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS,
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
|
||||
}
|
||||
|
||||
void bli_gemm_cntx_finalize( cntx_t* cntx )
|
||||
@@ -74,7 +73,7 @@ void bli_gemm_cntx_finalize( cntx_t* cntx )
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_trsm_cntx_init( cntx_t* cntx )
|
||||
void bli_trsm_cntx_init( num_t dt, cntx_t* cntx )
|
||||
{
|
||||
// Clear the context fields.
|
||||
bli_cntx_obj_clear( cntx );
|
||||
@@ -92,7 +91,7 @@ void bli_trsm_cntx_init( cntx_t* cntx )
|
||||
bli_gks_cntx_set_l3_nat_ukr( BLIS_TRSM_U_UKR, cntx );
|
||||
|
||||
// Initialize the context with packm-related kernels.
|
||||
bli_packm_cntx_init( cntx );
|
||||
bli_packm_cntx_init( dt, cntx );
|
||||
|
||||
// Initialize the context with the current architecture's register
|
||||
// and cache blocksizes (and multiples), given the execution method.
|
||||
@@ -106,9 +105,8 @@ void bli_trsm_cntx_init( cntx_t* cntx )
|
||||
cntx );
|
||||
|
||||
// Set the pack_t schemas for native execution.
|
||||
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS,
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
|
||||
}
|
||||
|
||||
void bli_trsm_cntx_finalize( cntx_t* cntx )
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx );
|
||||
|
||||
GENPROT( gemm )
|
||||
|
||||
@@ -39,8 +39,17 @@ cntl_t* bli_gemm_cntl_create
|
||||
opid_t family
|
||||
)
|
||||
{
|
||||
void* macro_kernel_p = bli_gemm_ker_var2;
|
||||
return bli_gemmbp_cntl_create( family );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
cntl_t* bli_gemmbp_cntl_create
|
||||
(
|
||||
opid_t family
|
||||
)
|
||||
{
|
||||
void* macro_kernel_p = bli_gemm_ker_var2;
|
||||
|
||||
// Change the macro-kernel if the operation family is herk or trmm.
|
||||
if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2;
|
||||
@@ -64,7 +73,7 @@ cntl_t* bli_gemm_cntl_create
|
||||
// Create a node for packing matrix A.
|
||||
cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create
|
||||
(
|
||||
bli_gemm_packa,
|
||||
bli_gemm_packa, // pack the left-hand operand
|
||||
bli_packm_blk_var1,
|
||||
BLIS_MR,
|
||||
BLIS_KR,
|
||||
@@ -87,7 +96,7 @@ cntl_t* bli_gemm_cntl_create
|
||||
// Create a node for packing matrix B.
|
||||
cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create
|
||||
(
|
||||
bli_gemm_packb,
|
||||
bli_gemm_packb, // pack the right-hand operand
|
||||
bli_packm_blk_var1,
|
||||
BLIS_KR,
|
||||
BLIS_NR,
|
||||
@@ -118,6 +127,95 @@ cntl_t* bli_gemm_cntl_create
|
||||
return gemm_cntl_vl_mm;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
cntl_t* bli_gemmpb_cntl_create
|
||||
(
|
||||
opid_t family
|
||||
)
|
||||
{
|
||||
void* macro_kernel_p = bli_gemm_ker_var1;
|
||||
|
||||
// Change the macro-kernel if the operation family is herk or trmm.
|
||||
//if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2;
|
||||
//else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2;
|
||||
|
||||
// Create two nodes for the macro-kernel.
|
||||
cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_obj_create
|
||||
(
|
||||
BLIS_MR, // needed for bli_thrinfo_rgrow()
|
||||
NULL, // variant function pointer not used
|
||||
NULL // no sub-node; this is the leaf of the tree.
|
||||
);
|
||||
|
||||
cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_obj_create
|
||||
(
|
||||
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
|
||||
macro_kernel_p,
|
||||
gemm_cntl_ub_ke
|
||||
);
|
||||
|
||||
// Create a node for packing matrix A (which is really the right-hand
|
||||
// operand "B").
|
||||
cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create
|
||||
(
|
||||
bli_gemm_packb, // pack the right-hand operand
|
||||
bli_packm_blk_var1,
|
||||
BLIS_KR,
|
||||
BLIS_MR,
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
BLIS_BUFFER_FOR_A_BLOCK,
|
||||
gemm_cntl_pb_ub
|
||||
);
|
||||
|
||||
// Create a node for partitioning the n dimension by MC.
|
||||
cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_obj_create
|
||||
(
|
||||
BLIS_MC,
|
||||
bli_gemm_blk_var2,
|
||||
gemm_cntl_packb
|
||||
);
|
||||
|
||||
// Create a node for packing matrix B (which is really the left-hand
|
||||
// operand "A").
|
||||
cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create
|
||||
(
|
||||
bli_gemm_packa, // pack the left-hand operand
|
||||
bli_packm_blk_var1,
|
||||
BLIS_NR,
|
||||
BLIS_KR,
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_ROW_PANELS,
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
gemm_cntl_op_pb
|
||||
);
|
||||
|
||||
// Create a node for partitioning the k dimension by KC.
|
||||
cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create
|
||||
(
|
||||
BLIS_KC,
|
||||
bli_gemm_blk_var3,
|
||||
gemm_cntl_packa
|
||||
);
|
||||
|
||||
// Create a node for partitioning the m dimension by NC.
|
||||
cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create
|
||||
(
|
||||
BLIS_NC,
|
||||
bli_gemm_blk_var1,
|
||||
gemm_cntl_mm_op
|
||||
);
|
||||
|
||||
return gemm_cntl_vl_mm;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_gemm_cntl_free
|
||||
(
|
||||
cntl_t* cntl,
|
||||
|
||||
@@ -37,6 +37,20 @@ cntl_t* bli_gemm_cntl_create
|
||||
opid_t family
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
cntl_t* bli_gemmbp_cntl_create
|
||||
(
|
||||
opid_t family
|
||||
);
|
||||
|
||||
cntl_t* bli_gemmpb_cntl_create
|
||||
(
|
||||
opid_t family
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_gemm_cntl_free
|
||||
(
|
||||
cntl_t* cntl,
|
||||
|
||||
@@ -112,5 +112,6 @@ void bli_gemm_front
|
||||
cntl
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
56
frame/3/gemm/bli_gemm_ker_var1.c
Normal file
56
frame/3/gemm/bli_gemm_ker_var1.c
Normal file
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_gemm_ker_var1
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
// Implement _ker_var1() in terms of _ker_var2() by transposing the
|
||||
// entire suboperation (which also requires swapping A and B).
|
||||
|
||||
bli_obj_induce_trans( *a );
|
||||
bli_obj_induce_trans( *b );
|
||||
bli_obj_induce_trans( *c );
|
||||
|
||||
bli_gemm_ker_var2( b, a, c, cntx, cntl, thread );
|
||||
}
|
||||
|
||||
@@ -109,6 +109,26 @@ void bli_gemm_ker_var2
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( *c );
|
||||
|
||||
// If 1m is being employed on a column- or row-stored matrix with a
|
||||
// real-valued beta, we can use the real domain macro-kernel, which
|
||||
// eliminates a little overhead associated with the 1m virtual
|
||||
// micro-kernel.
|
||||
#if 1
|
||||
if ( bli_is_1m_packed( schema_a ) )
|
||||
{
|
||||
bli_l3_ind_recast_1m_params
|
||||
(
|
||||
dt_exec,
|
||||
schema_a,
|
||||
c,
|
||||
m, n, k,
|
||||
pd_a, ps_a,
|
||||
pd_b, ps_b,
|
||||
rs_c, cs_c
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
@@ -56,6 +56,7 @@ GENPROT( gemm_blk_var3 )
|
||||
GENPROT( gemm_packa )
|
||||
GENPROT( gemm_packb )
|
||||
|
||||
GENPROT( gemm_ker_var1 )
|
||||
GENPROT( gemm_ker_var2 )
|
||||
|
||||
// Headers for induced algorithms:
|
||||
|
||||
@@ -85,6 +85,7 @@ void bli_blksz_obj_free
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
#if 0
|
||||
void bli_blksz_reduce_dt_to
|
||||
(
|
||||
num_t dt_bm, blksz_t* bmult,
|
||||
@@ -116,6 +117,66 @@ void bli_blksz_reduce_dt_to
|
||||
bli_blksz_set_def( blksz_def, dt_bs, blksz );
|
||||
bli_blksz_set_max( blksz_max, dt_bs, blksz );
|
||||
}
|
||||
#endif
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_blksz_reduce_def_to
|
||||
(
|
||||
num_t dt_bm, blksz_t* bmult,
|
||||
num_t dt_bs, blksz_t* blksz
|
||||
)
|
||||
{
|
||||
dim_t blksz_def = bli_blksz_get_def( dt_bs, blksz );
|
||||
|
||||
dim_t bmult_val = bli_blksz_get_def( dt_bm, bmult );
|
||||
|
||||
// If the blocksize multiple is zero, we do nothing.
|
||||
if ( bmult_val == 0 ) return;
|
||||
|
||||
// Round the default blocksize value down to its nearest
|
||||
// multiple of bmult_val. (Notice that we
|
||||
// ignore the "max" entry in the bmult object since that would
|
||||
// correspond to the packing dimension, which plays no role
|
||||
// as a blocksize multiple.)
|
||||
blksz_def = ( blksz_def / bmult_val ) * bmult_val;
|
||||
|
||||
// Make sure the new default blocksize is at least the blocksize
|
||||
// multiple.
|
||||
if ( blksz_def == 0 ) blksz_def = bmult_val;
|
||||
|
||||
// Store the new default blocksize back to the object.
|
||||
bli_blksz_set_def( blksz_def, dt_bs, blksz );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_blksz_reduce_max_to
|
||||
(
|
||||
num_t dt_bm, blksz_t* bmult,
|
||||
num_t dt_bs, blksz_t* blksz
|
||||
)
|
||||
{
|
||||
dim_t blksz_max = bli_blksz_get_max( dt_bs, blksz );
|
||||
|
||||
dim_t bmult_val = bli_blksz_get_def( dt_bm, bmult );
|
||||
|
||||
// If the blocksize multiple is zero, we do nothing.
|
||||
if ( bmult_val == 0 ) return;
|
||||
|
||||
// Round the blocksize value down to its nearest multiple
|
||||
// of bmult_val. (Notice that we ignore the "max" entry in the
|
||||
// bmult object since that would correspond to the packing
|
||||
// dimension, which plays no role as a blocksize multiple.)
|
||||
blksz_max = ( blksz_max / bmult_val ) * bmult_val;
|
||||
|
||||
// Make sure the new blocksize value is at least the blocksize
|
||||
// multiple.
|
||||
if ( blksz_max == 0 ) blksz_max = bmult_val;
|
||||
|
||||
// Store the new blocksize back to the object.
|
||||
bli_blksz_set_max( blksz_max, dt_bs, blksz );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@@ -89,11 +89,23 @@
|
||||
(b_dst)->e[ dt_dst ] = (b_src)->e[ dt_src ]; \
|
||||
}
|
||||
|
||||
#define bli_blksz_scale_def( num, den, dt, b ) \
|
||||
{ \
|
||||
(b)->v[ dt ] = ( (b)->v[ dt ] * num ) / den; \
|
||||
}
|
||||
|
||||
#define bli_blksz_scale_max( num, den, dt, b ) \
|
||||
{ \
|
||||
(b)->e[ dt ] = ( (b)->e[ dt ] * num ) / den; \
|
||||
}
|
||||
|
||||
#if 0
|
||||
#define bli_blksz_scale_dt_by( num, den, dt, b ) \
|
||||
{ \
|
||||
(b)->v[ dt ] = ( (b)->v[ dt ] * num ) / den; \
|
||||
(b)->e[ dt ] = ( (b)->e[ dt ] * num ) / den; \
|
||||
}
|
||||
#endif
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
@@ -121,12 +133,25 @@ void bli_blksz_obj_free
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
#if 0
|
||||
void bli_blksz_reduce_dt_to
|
||||
(
|
||||
num_t dt_bm, blksz_t* bmult,
|
||||
num_t dt_bs, blksz_t* blksz
|
||||
);
|
||||
#endif
|
||||
|
||||
void bli_blksz_reduce_def_to
|
||||
(
|
||||
num_t dt_bm, blksz_t* bmult,
|
||||
num_t dt_bs, blksz_t* blksz
|
||||
);
|
||||
|
||||
void bli_blksz_reduce_max_to
|
||||
(
|
||||
num_t dt_bm, blksz_t* bmult,
|
||||
num_t dt_bs, blksz_t* blksz
|
||||
);
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
dim_t bli_determine_blocksize
|
||||
|
||||
@@ -97,6 +97,16 @@ void bli_cntl_free
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
if ( thread != NULL ) bli_cntl_free_w_thrinfo( cntl, thread );
|
||||
else bli_cntl_free_wo_thrinfo( cntl );
|
||||
}
|
||||
|
||||
void bli_cntl_free_w_thrinfo
|
||||
(
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
// Base case: simply return when asked to free NULL nodes.
|
||||
if ( cntl == NULL ) return;
|
||||
@@ -112,7 +122,7 @@ void bli_cntl_free
|
||||
{
|
||||
// Recursively free all memory associated with the sub-node and its
|
||||
// children.
|
||||
bli_cntl_free( cntl_sub_node, thread_sub_node );
|
||||
bli_cntl_free_w_thrinfo( cntl_sub_node, thread_sub_node );
|
||||
}
|
||||
|
||||
// Free the current node's params field, if it is non-NULL.
|
||||
@@ -122,8 +132,8 @@ void bli_cntl_free
|
||||
}
|
||||
|
||||
// Release the current node's pack mem_t entry back to the memory
|
||||
// broker from which it originated, but only if the current thread
|
||||
// is chief for its group, and only if the mem_t is allocated.
|
||||
// broker from which it originated, but only if the mem_t entry is
|
||||
// allocated, and only if the current thread is chief for its group.
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
if ( bli_mem_is_alloc( cntl_pack_mem ) )
|
||||
{
|
||||
@@ -134,6 +144,42 @@ void bli_cntl_free
|
||||
bli_cntl_obj_free( cntl );
|
||||
}
|
||||
|
||||
void bli_cntl_free_wo_thrinfo
|
||||
(
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
// Base case: simply return when asked to free NULL nodes.
|
||||
if ( cntl == NULL ) return;
|
||||
|
||||
cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl );
|
||||
void* cntl_params = bli_cntl_params( cntl );
|
||||
mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl );
|
||||
|
||||
{
|
||||
// Recursively free all memory associated with the sub-node and its
|
||||
// children.
|
||||
bli_cntl_free_wo_thrinfo( cntl_sub_node );
|
||||
}
|
||||
|
||||
// Free the current node's params field, if it is non-NULL.
|
||||
if ( cntl_params != NULL )
|
||||
{
|
||||
bli_free_intl( cntl_params );
|
||||
}
|
||||
|
||||
// Release the current node's pack mem_t entry back to the memory
|
||||
// broker from which it originated, but only if the mem_t entry is
|
||||
// allocated.
|
||||
if ( bli_mem_is_alloc( cntl_pack_mem ) )
|
||||
{
|
||||
bli_membrk_release( cntl_pack_mem );
|
||||
}
|
||||
|
||||
// Free the current node.
|
||||
bli_cntl_obj_free( cntl );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
cntl_t* bli_cntl_copy
|
||||
|
||||
@@ -75,12 +75,25 @@ void bli_cntl_obj_clear
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_cntl_free
|
||||
(
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
void bli_cntl_free_w_thrinfo
|
||||
(
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
void bli_cntl_free_wo_thrinfo
|
||||
(
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
cntl_t* bli_cntl_copy
|
||||
(
|
||||
cntl_t* cntl
|
||||
|
||||
@@ -330,14 +330,24 @@ ind_t bli_cntx_get_ind_method( cntx_t* cntx )
|
||||
return bli_cntx_method( cntx );
|
||||
}
|
||||
|
||||
pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx )
|
||||
pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx )
|
||||
{
|
||||
return bli_cntx_schema_a( cntx );
|
||||
return bli_cntx_schema_a_block( cntx );
|
||||
}
|
||||
|
||||
pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx )
|
||||
pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx )
|
||||
{
|
||||
return bli_cntx_schema_b( cntx );
|
||||
return bli_cntx_schema_b_panel( cntx );
|
||||
}
|
||||
|
||||
pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx )
|
||||
{
|
||||
return bli_cntx_schema_c_panel( cntx );
|
||||
}
|
||||
|
||||
bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx )
|
||||
{
|
||||
return bli_cntx_anti_pref( cntx );
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -386,27 +396,27 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
{
|
||||
/* Example prototypes:
|
||||
|
||||
void
|
||||
bli_cntx_set_blkszs(
|
||||
void bli_cntx_set_blkszs
|
||||
(
|
||||
ind_t method = BLIS_NAT,
|
||||
dim_t n_bs,
|
||||
bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id,
|
||||
bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id,
|
||||
bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id,
|
||||
...
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
ind_t method = BLIS_NAT,
|
||||
dim_t n_bs,
|
||||
bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id,
|
||||
bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id,
|
||||
bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id,
|
||||
...
|
||||
cntx_t* cntx );
|
||||
|
||||
void
|
||||
bli_cntx_set_blkszs(
|
||||
|
||||
ind_t method != BLIS_NAT,
|
||||
dim_t n_bs,
|
||||
bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, dim_t scalr0,
|
||||
bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, dim_t scalr1,
|
||||
bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, dim_t scalr2,
|
||||
...
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_blkszs
|
||||
(
|
||||
ind_t method != BLIS_NAT,
|
||||
dim_t n_bs,
|
||||
bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, dim_t def_scalr0, dim_t max_scalr0,
|
||||
bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, dim_t def_scalr1, dim_t max_scalr1,
|
||||
bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, dim_t def_scalr2, dim_t max_scalr2,
|
||||
...
|
||||
cntx_t* cntx
|
||||
);
|
||||
*/
|
||||
va_list args;
|
||||
dim_t i;
|
||||
@@ -414,7 +424,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
bszid_t* bszids;
|
||||
blksz_t** blkszs;
|
||||
bszid_t* bmults;
|
||||
dim_t* scalrs;
|
||||
double* dsclrs;
|
||||
double* msclrs;
|
||||
|
||||
cntx_t* cntx;
|
||||
|
||||
@@ -426,7 +437,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) );
|
||||
blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ) );
|
||||
bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ) );
|
||||
scalrs = bli_malloc_intl( n_bs * sizeof( dim_t ) );
|
||||
dsclrs = bli_malloc_intl( n_bs * sizeof( double ) );
|
||||
msclrs = bli_malloc_intl( n_bs * sizeof( double ) );
|
||||
|
||||
// -- Begin variable argument section --
|
||||
|
||||
@@ -444,9 +456,9 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
// - the address of the blksz_t object, and
|
||||
// - the bszid_t of the multiple we need to associate with
|
||||
// the blksz_t object.
|
||||
const bszid_t bs_id = va_arg( args, bszid_t );
|
||||
blksz_t* blksz = va_arg( args, blksz_t* );
|
||||
const bszid_t bm_id = va_arg( args, bszid_t );
|
||||
bszid_t bs_id = va_arg( args, bszid_t );
|
||||
blksz_t* blksz = va_arg( args, blksz_t* );
|
||||
bszid_t bm_id = va_arg( args, bszid_t );
|
||||
|
||||
// Store the values in our temporary arrays.
|
||||
bszids[ i ] = bs_id;
|
||||
@@ -464,18 +476,21 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
// - the address of the blksz_t object, and
|
||||
// - the bszid_t of the multiple we need to associate with
|
||||
// the blksz_t object.
|
||||
// - the scalar we wish to apply to the real blocksizes to
|
||||
// come up with the induced complex blocksizes.
|
||||
const bszid_t bs_id = va_arg( args, bszid_t );
|
||||
blksz_t* blksz = va_arg( args, blksz_t* );
|
||||
const bszid_t bm_id = va_arg( args, bszid_t );
|
||||
const dim_t scalr = va_arg( args, dim_t );
|
||||
// - the scalars we wish to apply to the real blocksizes to
|
||||
// come up with the induced complex blocksizes (for default
|
||||
// and maximum blocksizes).
|
||||
bszid_t bs_id = va_arg( args, bszid_t );
|
||||
blksz_t* blksz = va_arg( args, blksz_t* );
|
||||
bszid_t bm_id = va_arg( args, bszid_t );
|
||||
double dsclr = va_arg( args, double );
|
||||
double msclr = va_arg( args, double );
|
||||
|
||||
// Store the values in our temporary arrays.
|
||||
bszids[ i ] = bs_id;
|
||||
blkszs[ i ] = blksz;
|
||||
bmults[ i ] = bm_id;
|
||||
scalrs[ i ] = scalr;
|
||||
dsclrs[ i ] = dsclr;
|
||||
msclrs[ i ] = msclr;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -510,12 +525,12 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
{
|
||||
// Read the current blocksize id, blksz_t* pointer, blocksize
|
||||
// multiple id, and blocksize scalar.
|
||||
const bszid_t bs_id = bszids[ i ];
|
||||
const bszid_t bm_id = bmults[ i ];
|
||||
bszid_t bs_id = bszids[ i ];
|
||||
bszid_t bm_id = bmults[ i ];
|
||||
|
||||
blksz_t* blksz = blkszs[ i ];
|
||||
blksz_t* blksz = blkszs[ i ];
|
||||
|
||||
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
|
||||
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
|
||||
|
||||
// Copy the blksz_t object contents into the appropriate
|
||||
// location within the context's blksz_t array. Do the same
|
||||
@@ -534,14 +549,15 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
{
|
||||
// Read the current blocksize id, blksz_t pointer, blocksize
|
||||
// multiple id, and blocksize scalar.
|
||||
const bszid_t bs_id = bszids[ i ];
|
||||
const bszid_t bm_id = bmults[ i ];
|
||||
const dim_t scalr = scalrs[ i ];
|
||||
bszid_t bs_id = bszids[ i ];
|
||||
bszid_t bm_id = bmults[ i ];
|
||||
double dsclr = dsclrs[ i ];
|
||||
double msclr = msclrs[ i ];
|
||||
|
||||
blksz_t* blksz = blkszs[ i ];
|
||||
blksz_t* bmult = blkszs[ i ];
|
||||
blksz_t* blksz = blkszs[ i ];
|
||||
blksz_t* bmult = blkszs[ i ];
|
||||
|
||||
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
|
||||
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
|
||||
|
||||
// Copy the real domain values of the source blksz_t object into
|
||||
// the context, duplicating into the complex domain fields.
|
||||
@@ -550,20 +566,50 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DCOMPLEX, cntx_blksz );
|
||||
|
||||
// The next steps apply only to cache blocksizes, and not register
|
||||
// blocksizes (ie: they only apply to blocksizes for which the
|
||||
// blocksize multiple id is different than the blocksize id) and
|
||||
// only when the scalar provided is non-unit.
|
||||
if ( bs_id != bm_id && scalr != 1 )
|
||||
// If the default blocksize scalar is non-unit, we need to scale
|
||||
// the complex domain default blocksizes.
|
||||
if ( dsclr != 1.0 )
|
||||
{
|
||||
// Scale the complex domain values in the blocksize object.
|
||||
bli_blksz_scale_dt_by( 1, scalr, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_scale_dt_by( 1, scalr, BLIS_DCOMPLEX, cntx_blksz );
|
||||
// Scale the complex domain default blocksize values in the
|
||||
// blocksize object.
|
||||
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz );
|
||||
|
||||
// Finally, round the newly-scaled blocksizes down to their
|
||||
// respective multiples.
|
||||
bli_blksz_reduce_dt_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_reduce_dt_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz );
|
||||
if ( bs_id != bm_id )
|
||||
{
|
||||
// Round the newly-scaled blocksizes down to their multiple.
|
||||
// (Note that both the default and maximum blocksize values
|
||||
// must be a multiple of the same blocksize multiple.) Also,
|
||||
// note that this is only done when the blocksize id is not
|
||||
// equal to the blocksize multiple id (ie: we don't round
|
||||
// down scaled register blocksizes since they are their own
|
||||
// multiples).
|
||||
bli_blksz_reduce_def_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_reduce_def_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz );
|
||||
}
|
||||
}
|
||||
|
||||
// Similarly, if the maximum blocksize scalar is non-unit, we need
|
||||
// to scale the complex domain maximum blocksizes.
|
||||
if ( msclr != 1.0 )
|
||||
{
|
||||
// Scale the complex domain maximum blocksize values in the
|
||||
// blocksize object.
|
||||
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz );
|
||||
|
||||
if ( bs_id != bm_id )
|
||||
{
|
||||
// Round the newly-scaled blocksizes down to their multiple.
|
||||
// (Note that both the default and maximum blocksize values
|
||||
// must be a multiple of the same blocksize multiple.) Also,
|
||||
// note that this is only done when the blocksize id is not
|
||||
// equal to the blocksize multiple id (ie: we don't round
|
||||
// down scaled register blocksizes since they are their own
|
||||
// multiples).
|
||||
bli_blksz_reduce_max_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_reduce_max_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz );
|
||||
}
|
||||
}
|
||||
|
||||
// Copy the blocksize multiple id into the context.
|
||||
@@ -575,7 +621,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
bli_free_intl( blkszs );
|
||||
bli_free_intl( bszids );
|
||||
bli_free_intl( bmults );
|
||||
bli_free_intl( scalrs );
|
||||
bli_free_intl( dsclrs );
|
||||
bli_free_intl( msclrs );
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -668,31 +715,39 @@ void bli_cntx_set_ind_method( ind_t method,
|
||||
bli_cntx_set_method( method, cntx );
|
||||
}
|
||||
|
||||
void bli_cntx_set_pack_schema_ab( pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
cntx_t* cntx )
|
||||
void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bli_cntx_set_schema_a( schema_a, cntx );
|
||||
bli_cntx_set_schema_b( schema_b, cntx );
|
||||
bli_cntx_set_schema_a_block( schema_a, cntx );
|
||||
bli_cntx_set_schema_b_panel( schema_b, cntx );
|
||||
}
|
||||
|
||||
void bli_cntx_set_pack_schema_a( pack_t schema_a,
|
||||
cntx_t* cntx )
|
||||
void bli_cntx_set_pack_schema_a_block( pack_t schema_a,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bli_cntx_set_schema_a( schema_a, cntx );
|
||||
bli_cntx_set_schema_a_block( schema_a, cntx );
|
||||
}
|
||||
|
||||
void bli_cntx_set_pack_schema_b( pack_t schema_b,
|
||||
cntx_t* cntx )
|
||||
void bli_cntx_set_pack_schema_b_panel( pack_t schema_b,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bli_cntx_set_schema_b( schema_b, cntx );
|
||||
bli_cntx_set_schema_b_panel( schema_b, cntx );
|
||||
}
|
||||
|
||||
void bli_cntx_set_pack_schema_c( pack_t schema_c,
|
||||
void bli_cntx_set_pack_schema_c_panel( pack_t schema_c,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bli_cntx_set_schema_c_panel( schema_c, cntx );
|
||||
}
|
||||
|
||||
#if 0
|
||||
void bli_cntx_set_ukr_anti_pref( bool_t anti_pref,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bli_cntx_set_schema_c( schema_c, cntx );
|
||||
bli_cntx_set_anti_pref( anti_pref, cntx );
|
||||
}
|
||||
#endif
|
||||
|
||||
void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx,
|
||||
dim_t m, dim_t n, dim_t k )
|
||||
@@ -729,12 +784,20 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx,
|
||||
}
|
||||
}
|
||||
|
||||
jc = bli_env_read_nway( "BLIS_JC_NT", jc );
|
||||
//pc = bli_env_read_nway( "BLIS_KC_NT", 1 );
|
||||
pc = 1;
|
||||
ic = bli_env_read_nway( "BLIS_IC_NT", ic );
|
||||
jr = bli_env_read_nway( "BLIS_JR_NT", jr );
|
||||
ir = bli_env_read_nway( "BLIS_IR_NT", ir );
|
||||
pc = 1;
|
||||
|
||||
dim_t jc_env = bli_env_read_nway( "BLIS_JC_NT", -1 );
|
||||
dim_t ic_env = bli_env_read_nway( "BLIS_IC_NT", -1 );
|
||||
dim_t jr_env = bli_env_read_nway( "BLIS_JR_NT", -1 );
|
||||
dim_t ir_env = bli_env_read_nway( "BLIS_IR_NT", -1 );
|
||||
|
||||
if (jc_env != -1 || ic_env != -1 || jr_env != -1 || ir_env != -1)
|
||||
{
|
||||
jc = (jc_env == -1 ? 1 : jc_env);
|
||||
ic = (ic_env == -1 ? 1 : ic_env);
|
||||
jr = (jr_env == -1 ? 1 : jr_env);
|
||||
ir = (ir_env == -1 ? 1 : ir_env);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
@@ -867,6 +930,32 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj,
|
||||
return r_val;
|
||||
}
|
||||
|
||||
bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bool_t r_val = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx );
|
||||
|
||||
// If the anti-preference is set, negate the result.
|
||||
if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val;
|
||||
|
||||
return r_val;
|
||||
}
|
||||
|
||||
bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bool_t r_val = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx );
|
||||
|
||||
// If the anti-preference is set, negate the result.
|
||||
if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val;
|
||||
|
||||
return r_val;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
@@ -916,6 +1005,30 @@ bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj,
|
||||
return r_val;
|
||||
}
|
||||
|
||||
bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bool_t r_val = bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx );
|
||||
|
||||
// If the anti-preference is set, negate the result.
|
||||
if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val;
|
||||
|
||||
return r_val;
|
||||
}
|
||||
|
||||
bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
bool_t r_val = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx );
|
||||
|
||||
// If the anti-preference is set, negate the result.
|
||||
if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val;
|
||||
|
||||
return r_val;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_cntx_print( cntx_t* cntx )
|
||||
|
||||
@@ -59,6 +59,8 @@ typedef struct cntx_s
|
||||
pack_t schema_b;
|
||||
pack_t schema_c;
|
||||
|
||||
bool_t anti_pref;
|
||||
|
||||
dim_t* thrloop;
|
||||
|
||||
membrk_t* membrk;
|
||||
@@ -113,26 +115,30 @@ typedef struct cntx_s
|
||||
\
|
||||
( (cntx)->method )
|
||||
|
||||
#define bli_cntx_schema_a( cntx ) \
|
||||
#define bli_cntx_schema_a_block( cntx ) \
|
||||
\
|
||||
( (cntx)->schema_a )
|
||||
( (cntx)->schema_a_block )
|
||||
|
||||
#define bli_cntx_schema_b( cntx ) \
|
||||
#define bli_cntx_schema_b_panel( cntx ) \
|
||||
\
|
||||
( (cntx)->schema_b )
|
||||
( (cntx)->schema_b_panel )
|
||||
|
||||
#define bli_cntx_schema_c( cntx ) \
|
||||
#define bli_cntx_schema_c_panel( cntx ) \
|
||||
\
|
||||
( (cntx)->schema_c )
|
||||
( (cntx)->schema_c_panel )
|
||||
|
||||
#define bli_cntx_membrk( cntx ) \
|
||||
#define bli_cntx_anti_pref( cntx ) \
|
||||
\
|
||||
( (cntx)->membrk )
|
||||
( (cntx)->anti_pref )
|
||||
|
||||
// Field accessor: expands to the thrloop member of the context that
// cntx points to.
#define bli_cntx_thrloop( cntx ) \
	( (cntx)->thrloop )
|
||||
|
||||
// Field accessor: expands to the membrk member of the context that
// cntx points to.
#define bli_cntx_membrk( cntx ) \
	( (cntx)->membrk )
|
||||
|
||||
#if 1
|
||||
#define bli_cntx_jc_way( cntx ) \
|
||||
\
|
||||
@@ -211,24 +217,24 @@ typedef struct cntx_s
|
||||
(cntx_p)->method = _method; \
|
||||
}
|
||||
|
||||
#define bli_cntx_set_schema_a( _schema_a, cntx_p ) \
|
||||
#define bli_cntx_set_schema_a_block( _schema_a_block, cntx_p ) \
|
||||
{ \
|
||||
(cntx_p)->schema_a = _schema_a; \
|
||||
(cntx_p)->schema_a_block = _schema_a_block; \
|
||||
}
|
||||
|
||||
#define bli_cntx_set_schema_b( _schema_b, cntx_p ) \
|
||||
#define bli_cntx_set_schema_b_panel( _schema_b_panel, cntx_p ) \
|
||||
{ \
|
||||
(cntx_p)->schema_b = _schema_b; \
|
||||
(cntx_p)->schema_b_panel = _schema_b_panel; \
|
||||
}
|
||||
|
||||
#define bli_cntx_set_schema_c( _schema_c, cntx_p ) \
|
||||
#define bli_cntx_set_schema_c_panel( _schema_c_panel, cntx_p ) \
|
||||
{ \
|
||||
(cntx_p)->schema_c = _schema_c; \
|
||||
(cntx_p)->schema_c_panel = _schema_c_panel; \
|
||||
}
|
||||
|
||||
#define bli_cntx_set_membrk( _membrk, cntx_p ) \
|
||||
#define bli_cntx_set_anti_pref( _anti_pref, cntx_p ) \
|
||||
{ \
|
||||
(cntx_p)->membrk = _membrk; \
|
||||
(cntx_p)->anti_pref = _anti_pref; \
|
||||
}
|
||||
|
||||
#define bli_cntx_set_thrloop( jc_, pc_, ic_, jr_, ir_, cntx_p ) \
|
||||
@@ -241,6 +247,11 @@ typedef struct cntx_s
|
||||
(cntx_p)->thrloop[ BLIS_KR ] = 1; \
|
||||
}
|
||||
|
||||
// Field modifier: stores _membrk into the membrk member of the context
// that cntx_p points to.
#define bli_cntx_set_membrk( _membrk, cntx_p ) \
{ \
	(cntx_p)->membrk = _membrk; \
}
|
||||
|
||||
// cntx_t query (complex)
|
||||
|
||||
#define bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ) \
|
||||
@@ -323,13 +334,17 @@ typedef struct cntx_s
|
||||
\
|
||||
bli_cntx_method( cntx )
|
||||
|
||||
#define bli_cntx_get_pack_schema_a( cntx ) \
|
||||
#define bli_cntx_get_pack_schema_a_block( cntx ) \
|
||||
\
|
||||
bli_cntx_schema_a( cntx )
|
||||
bli_cntx_schema_a_block( cntx )
|
||||
|
||||
#define bli_cntx_get_pack_schema_b( cntx ) \
|
||||
#define bli_cntx_get_pack_schema_b_panel( cntx ) \
|
||||
\
|
||||
bli_cntx_schema_b( cntx )
|
||||
bli_cntx_schema_b_panel( cntx )
|
||||
|
||||
// Query wrapper: forwards to bli_cntx_schema_c_panel() for the given
// context.
#define bli_cntx_get_pack_schema_c_panel( cntx ) \
	bli_cntx_schema_c_panel( cntx )
|
||||
|
||||
#define bli_cntx_get_membrk( cntx ) \
|
||||
\
|
||||
@@ -395,9 +410,10 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx );
|
||||
// l1vkr_t ker_id,
|
||||
// cntx_t* cntx );
|
||||
//ind_t bli_cntx_get_ind_method( cntx_t* cntx );
|
||||
//pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx );
|
||||
//pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx );
|
||||
//pack_t bli_cntx_get_pack_schema_c( cntx_t* cntx );
|
||||
//pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx );
|
||||
//pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx );
|
||||
//pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx );
|
||||
//bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx );
|
||||
dim_t bli_cntx_get_num_threads( cntx_t* cntx );
|
||||
dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl );
|
||||
|
||||
@@ -425,15 +441,17 @@ void bli_cntx_set_packm_ukr( func_t* func,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_ind_method( ind_t method,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_pack_schema_ab( pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_pack_schema_a( pack_t schema_a,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_pack_schema_b( pack_t schema_b,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_pack_schema_c( pack_t schema_c,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_pack_schema_a_block( pack_t schema_a,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_pack_schema_b_panel( pack_t schema_b,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_pack_schema_c_panel( pack_t schema_c,
|
||||
cntx_t* cntx );
|
||||
//void bli_cntx_set_ukr_anti_pref( bool_t anti_pref,
|
||||
// cntx_t* cntx );
|
||||
void bli_cntx_set_thrloop_from_env( opid_t l3_op,
|
||||
side_t side,
|
||||
cntx_t* cntx,
|
||||
@@ -455,6 +473,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj,
|
||||
bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx );
|
||||
bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx );
|
||||
bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx );
|
||||
bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx );
|
||||
@@ -467,6 +491,12 @@ bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj,
|
||||
bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx );
|
||||
bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx );
|
||||
bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx );
|
||||
|
||||
// print function
|
||||
|
||||
@@ -488,13 +518,13 @@ void bli_cntx_print( cntx_t* cntx );
|
||||
// pointer is NULL. When initializing, the context address that should
|
||||
// be used (local or external) is assigned to cntx_p.
|
||||
|
||||
#define bli_cntx_init_local_if( opname, cntx, cntx_p ) \
|
||||
#define bli_cntx_init_local_if( opname, dt, cntx, cntx_p ) \
|
||||
\
|
||||
cntx_t _cntx_l; \
|
||||
\
|
||||
if ( bli_is_null( cntx ) ) \
|
||||
{ \
|
||||
PASTEMAC(opname,_cntx_init)( &_cntx_l ); \
|
||||
PASTEMAC(opname,_cntx_init)( dt, &_cntx_l ); \
|
||||
cntx_p = &_cntx_l; \
|
||||
} \
|
||||
else \
|
||||
@@ -510,13 +540,13 @@ void bli_cntx_print( cntx_t* cntx );
|
||||
}
|
||||
|
||||
|
||||
#define bli_cntx_init_local_if2( opname, suf, cntx, cntx_p ) \
|
||||
#define bli_cntx_init_local_if2( opname, suf, dt, cntx, cntx_p ) \
|
||||
\
|
||||
cntx_t _cntx_l; \
|
||||
\
|
||||
if ( bli_is_null( cntx ) ) \
|
||||
{ \
|
||||
PASTEMAC2(opname,suf,_cntx_init)( &_cntx_l ); \
|
||||
PASTEMAC2(opname,suf,_cntx_init)( dt, &_cntx_l ); \
|
||||
cntx_p = &_cntx_l; \
|
||||
} \
|
||||
else \
|
||||
|
||||
@@ -94,48 +94,47 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
{
|
||||
/* Example prototypes:
|
||||
|
||||
void
|
||||
bli_gks_cntx_set_blkszs(
|
||||
void bli_gks_cntx_set_blkszs
|
||||
(
|
||||
ind_t method = BLIS_NAT,
|
||||
dim_t n_bs,
|
||||
bszid_t bs0_id, bszid_t bm0_id,
|
||||
bszid_t bs1_id, bszid_t bm1_id,
|
||||
bszid_t bs2_id, bszid_t bm2_id,
|
||||
...
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
ind_t method = BLIS_NAT,
|
||||
dim_t n_bs,
|
||||
bszid_t bs0_id, bszid_t bm0_id,
|
||||
bszid_t bs1_id, bszid_t bm1_id,
|
||||
bszid_t bs2_id, bszid_t bm2_id,
|
||||
...
|
||||
cntx_t* cntx );
|
||||
|
||||
void
|
||||
bli_gks_cntx_set_blkszs(
|
||||
|
||||
ind_t method != BLIS_NAT,
|
||||
dim_t n_bs,
|
||||
bszid_t bs0_id, bszid_t bm0_id, dim_t scalr0,
|
||||
bszid_t bs1_id, bszid_t bm1_id, dim_t scalr1,
|
||||
bszid_t bs2_id, bszid_t bm2_id, dim_t scalr2,
|
||||
...
|
||||
cntx_t* cntx );
|
||||
void bli_gks_cntx_set_blkszs
|
||||
(
|
||||
ind_t method != BLIS_NAT,
|
||||
dim_t n_bs,
|
||||
bszid_t bs0_id, bszid_t bm0_id, double def_scalr0, double max_scalr0,
|
||||
bszid_t bs1_id, bszid_t bm1_id, double def_scalr1, double max_scalr1,
|
||||
bszid_t bs2_id, bszid_t bm2_id, double def_scalr2, double max_scalr2,
|
||||
...
|
||||
cntx_t* cntx
|
||||
);
|
||||
*/
|
||||
va_list args;
|
||||
dim_t i;
|
||||
|
||||
bszid_t* bszids;
|
||||
bszid_t* bmults;
|
||||
double* scalrs;
|
||||
double* dsclrs;
|
||||
double* msclrs;
|
||||
|
||||
cntx_t* cntx;
|
||||
|
||||
blksz_t* cntx_blkszs;
|
||||
bszid_t* cntx_bmults;
|
||||
|
||||
bszid_t bs_id;
|
||||
bszid_t bm_id;
|
||||
double scalr;
|
||||
|
||||
// Allocate some temporary local arrays.
|
||||
bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) );
|
||||
bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ) );
|
||||
scalrs = bli_malloc_intl( n_bs * sizeof( double ) );
|
||||
dsclrs = bli_malloc_intl( n_bs * sizeof( double ) );
|
||||
msclrs = bli_malloc_intl( n_bs * sizeof( double ) );
|
||||
|
||||
// -- Begin variable argument section --
|
||||
|
||||
@@ -152,8 +151,8 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
// - the bszid_t of the blocksize we're about to process,
|
||||
// - the bszid_t of the multiple we need to associate with
|
||||
// the blksz_t object.
|
||||
bs_id = va_arg( args, bszid_t );
|
||||
bm_id = va_arg( args, bszid_t );
|
||||
bszid_t bs_id = va_arg( args, bszid_t );
|
||||
bszid_t bm_id = va_arg( args, bszid_t );
|
||||
|
||||
// Store the values in our temporary arrays.
|
||||
bszids[ i ] = bs_id;
|
||||
@@ -169,16 +168,19 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
// - the bszid_t of the blocksize we're about to process,
|
||||
// - the bszid_t of the multiple we need to associate with
|
||||
// the blksz_t object.
|
||||
// - the scalar we wish to apply to the real blocksizes to
|
||||
// come up with the induced complex blocksizes.
|
||||
bs_id = va_arg( args, bszid_t );
|
||||
bm_id = va_arg( args, bszid_t );
|
||||
scalr = va_arg( args, double );
|
||||
// - the scalars we wish to apply to the real blocksizes to
|
||||
// come up with the induced complex blocksizes (for default
|
||||
// and maximum blocksizes).
|
||||
bszid_t bs_id = va_arg( args, bszid_t );
|
||||
bszid_t bm_id = va_arg( args, bszid_t );
|
||||
double dsclr = va_arg( args, double );
|
||||
double msclr = va_arg( args, double );
|
||||
|
||||
// Store the values in our temporary arrays.
|
||||
bszids[ i ] = bs_id;
|
||||
bmults[ i ] = bm_id;
|
||||
scalrs[ i ] = scalr;
|
||||
dsclrs[ i ] = dsclr;
|
||||
msclrs[ i ] = msclr;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -210,10 +212,10 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
for ( i = 0; i < n_bs; ++i )
|
||||
{
|
||||
// Read the current blocksize id, blocksize multiple id.
|
||||
bszid_t bs_id = bszids[ i ];
|
||||
bszid_t bm_id = bmults[ i ];
|
||||
bszid_t bs_id = bszids[ i ];
|
||||
bszid_t bm_id = bmults[ i ];
|
||||
|
||||
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
|
||||
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
|
||||
|
||||
// Query the blocksizes (blksz_t) associated with bs_id and save
|
||||
// them directly into the appropriate location in the context's
|
||||
@@ -231,41 +233,75 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
{
|
||||
// Read the current blocksize id, blocksize multiple id,
|
||||
// and blocksize scalar.
|
||||
bszid_t bs_id = bszids[ i ];
|
||||
bszid_t bm_id = bmults[ i ];
|
||||
double scalr = scalrs[ i ];
|
||||
bszid_t bs_id = bszids[ i ];
|
||||
bszid_t bm_id = bmults[ i ];
|
||||
double dsclr = dsclrs[ i ];
|
||||
double msclr = msclrs[ i ];
|
||||
|
||||
blksz_t blksz;
|
||||
blksz_t bmult;
|
||||
blksz_t blksz_l;
|
||||
blksz_t bmult_l;
|
||||
|
||||
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
|
||||
blksz_t* blksz = &blksz_l;
|
||||
blksz_t* bmult = &bmult_l;
|
||||
|
||||
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
|
||||
|
||||
// Query the blocksizes (blksz_t) associated with bs_id and bm_id
|
||||
// and use them to populate a pair of local blksz_t objects.
|
||||
bli_gks_get_blksz( bs_id, &blksz );
|
||||
bli_gks_get_blksz( bm_id, &bmult );
|
||||
bli_gks_get_blksz( bs_id, blksz );
|
||||
bli_gks_get_blksz( bm_id, bmult );
|
||||
|
||||
// Copy the real domain values of the source blksz_t object into
|
||||
// the context, duplicating into the complex domain fields.
|
||||
bli_blksz_copy_dt( BLIS_FLOAT, &blksz, BLIS_FLOAT, cntx_blksz );
|
||||
bli_blksz_copy_dt( BLIS_DOUBLE, &blksz, BLIS_DOUBLE, cntx_blksz );
|
||||
bli_blksz_copy_dt( BLIS_FLOAT, &blksz, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_copy_dt( BLIS_DOUBLE, &blksz, BLIS_DCOMPLEX, cntx_blksz );
|
||||
bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_FLOAT, cntx_blksz );
|
||||
bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DOUBLE, cntx_blksz );
|
||||
bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DCOMPLEX, cntx_blksz );
|
||||
|
||||
// The next steps apply only to cache blocksizes, and not register
|
||||
// blocksizes (ie: they only apply to blocksizes for which the
|
||||
// blocksize multiple id is different than the blocksize id) and
|
||||
// only when the scalar provided is non-unit.
|
||||
if ( bs_id != bm_id && scalr != 1.0 )
|
||||
// If the default blocksize scalar is non-unit, we need to scale
|
||||
// the complex domain default blocksizes.
|
||||
if ( dsclr != 1.0 )
|
||||
{
|
||||
// Scale the complex domain values in the blocksize object.
|
||||
bli_blksz_scale_dt_by( 1, (dim_t)scalr, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_scale_dt_by( 1, (dim_t)scalr, BLIS_DCOMPLEX, cntx_blksz );
|
||||
// Scale the complex domain default blocksize values in the
|
||||
// blocksize object.
|
||||
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz );
|
||||
|
||||
// Finally, round the newly-scaled blocksizes down to their
|
||||
// respective multiples.
|
||||
bli_blksz_reduce_dt_to( BLIS_FLOAT, &bmult, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_reduce_dt_to( BLIS_DOUBLE, &bmult, BLIS_DCOMPLEX, cntx_blksz );
|
||||
if ( bs_id != bm_id )
|
||||
{
|
||||
// Round the newly-scaled blocksizes down to their multiple.
|
||||
// (Note that both the default and maximum blocksize values
|
||||
// must be a multiple of the same blocksize multiple.) Also,
|
||||
// note that this is only done when the blocksize id is not
|
||||
// equal to the blocksize multiple id (ie: we don't round
|
||||
// down scaled register blocksizes since they are their own
|
||||
// multiples).
|
||||
bli_blksz_reduce_def_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_reduce_def_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz );
|
||||
}
|
||||
}
|
||||
|
||||
// Similarly, if the maximum blocksize scalar is non-unit, we need
|
||||
// to scale the complex domain maximum blocksizes.
|
||||
if ( msclr != 1.0 )
|
||||
{
|
||||
// Scale the complex domain maximum blocksize values in the
|
||||
// blocksize object.
|
||||
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz );
|
||||
|
||||
if ( bs_id != bm_id )
|
||||
{
|
||||
// Round the newly-scaled blocksizes down to their multiple.
|
||||
// (Note that both the default and maximum blocksize values
|
||||
// must be a multiple of the same blocksize multiple.) Also,
|
||||
// note that this is only done when the blocksize id is not
|
||||
// equal to the blocksize multiple id (ie: we don't round
|
||||
// down scaled register blocksizes since they are their own
|
||||
// multiples).
|
||||
bli_blksz_reduce_max_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz );
|
||||
bli_blksz_reduce_max_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz );
|
||||
}
|
||||
}
|
||||
|
||||
// Copy the blocksize multiple id into the context.
|
||||
@@ -276,7 +312,8 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
// Free the temporary local arrays.
|
||||
bli_free_intl( bszids );
|
||||
bli_free_intl( bmults );
|
||||
bli_free_intl( scalrs );
|
||||
bli_free_intl( dsclrs );
|
||||
bli_free_intl( msclrs );
|
||||
}
|
||||
|
||||
|
||||
@@ -337,6 +374,18 @@ static func_t bli_gks_l3_ind_ukrs[BLIS_NUM_IND_METHODS]
|
||||
/* trsm_l */ { { NULL, BLIS_CTRSM4M1_L_UKERNEL, NULL, BLIS_ZTRSM4M1_L_UKERNEL, } },
|
||||
/* trsm_u */ { { NULL, BLIS_CTRSM4M1_U_UKERNEL, NULL, BLIS_ZTRSM4M1_U_UKERNEL, } },
|
||||
},
|
||||
/* 1m */ {
|
||||
/* gemm */ { { BLIS_SGEMM_UKERNEL, BLIS_CGEMM1M_UKERNEL,
|
||||
BLIS_DGEMM_UKERNEL, BLIS_ZGEMM1M_UKERNEL, } },
|
||||
/* gemmtrsm_l */ { { NULL, BLIS_CGEMMTRSM1M_L_UKERNEL,
|
||||
NULL, BLIS_ZGEMMTRSM1M_L_UKERNEL, } },
|
||||
/* gemmtrsm_u */ { { NULL, BLIS_CGEMMTRSM1M_U_UKERNEL,
|
||||
NULL, BLIS_ZGEMMTRSM1M_U_UKERNEL, } },
|
||||
/* trsm_l */ { { NULL, BLIS_CTRSM1M_L_UKERNEL,
|
||||
NULL, BLIS_ZTRSM1M_L_UKERNEL, } },
|
||||
/* trsm_u */ { { NULL, BLIS_CTRSM1M_U_UKERNEL,
|
||||
NULL, BLIS_ZTRSM1M_U_UKERNEL, } },
|
||||
},
|
||||
/* nat */ {
|
||||
/* gemm */ { { BLIS_SGEMM_UKERNEL, BLIS_CGEMM_UKERNEL,
|
||||
BLIS_DGEMM_UKERNEL, BLIS_ZGEMM_UKERNEL, } },
|
||||
@@ -557,6 +606,9 @@ void bli_gks_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr,
|
||||
mbool_t* cntx_l3_nat_ukr_pref = &cntx_l3_nat_ukr_prefs[ ukr ];
|
||||
|
||||
bli_gks_get_l3_nat_ukr_prefs( ukr, cntx_l3_nat_ukr_pref );
|
||||
|
||||
// Explicitly set the anti-preference to FALSE.
|
||||
bli_cntx_set_anti_pref( FALSE, cntx );
|
||||
}
|
||||
|
||||
|
||||
@@ -565,6 +617,8 @@ void bli_gks_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr,
|
||||
// -- packm structure-aware kernel structure -----------------------------------
|
||||
//
|
||||
|
||||
// IF ENABLED: NEEDS UPDATING FOR 1M.
|
||||
|
||||
static func_t bli_gks_packm_struc_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
|
||||
{
|
||||
/* float (0) scomplex (1) double (2) dcomplex (3) */
|
||||
|
||||
@@ -61,8 +61,10 @@ void bli_memsys_init( void )
|
||||
if ( bli_memsys_is_init == TRUE ) return;
|
||||
|
||||
// Create and initialize a context for gemm so we have something
|
||||
// to pass into bli_membrk_init_pools().
|
||||
bli_gemm_cntx_init( &cntx );
|
||||
// to pass into bli_membrk_init_pools(). We use BLIS_DOUBLE for
|
||||
// the datatype, but the dt argument is actually only used when
|
||||
// initializing contexts for induced methods.
|
||||
bli_gemm_cntx_init( BLIS_DOUBLE, &cntx );
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
_Pragma( "omp critical (mem)" )
|
||||
|
||||
@@ -877,6 +877,12 @@ bli_obj_width_stored( obj )
|
||||
(obj).n_panel = n0; \
|
||||
}
|
||||
|
||||
// Sets both panel dimensions of obj in one call: m0 is forwarded to
// bli_obj_set_panel_length() and n0 to bli_obj_set_panel_width().
#define bli_obj_set_panel_dims( m0, n0, obj ) \
{ \
	bli_obj_set_panel_length( m0, obj ); \
	bli_obj_set_panel_width( n0, obj ); \
}
|
||||
|
||||
#define bli_obj_set_panel_dim( panel_dim, obj ) \
|
||||
{ \
|
||||
(obj).pd = panel_dim; \
|
||||
@@ -985,6 +991,7 @@ bli_obj_width_stored( obj )
|
||||
#define bli_obj_induce_trans( obj ) \
|
||||
{ \
|
||||
{ \
|
||||
/* Induce transposition among basic fields. */ \
|
||||
dim_t m_ = bli_obj_length( obj ); \
|
||||
dim_t n_ = bli_obj_width( obj ); \
|
||||
inc_t rs_ = bli_obj_row_stride( obj ); \
|
||||
@@ -1000,6 +1007,15 @@ bli_obj_width_stored( obj )
|
||||
\
|
||||
if ( bli_obj_is_upper_or_lower( obj ) ) \
|
||||
bli_obj_toggle_uplo( obj ); \
|
||||
\
|
||||
/* Induce transposition among packed fields. */ \
|
||||
dim_t m_padded_ = bli_obj_padded_length( obj ); \
|
||||
dim_t n_padded_ = bli_obj_padded_width( obj ); \
|
||||
dim_t m_panel_ = bli_obj_panel_length( obj ); \
|
||||
dim_t n_panel_ = bli_obj_panel_width( obj ); \
|
||||
\
|
||||
bli_obj_set_padded_dims( n_padded_, m_padded_, obj ); \
|
||||
bli_obj_set_panel_dims( n_panel_, m_panel_, obj ); \
|
||||
\
|
||||
/* Note that this macro DOES NOT touch the transposition bit! If
|
||||
the calling code is using this macro to handle an object whose
|
||||
|
||||
@@ -654,6 +654,19 @@
|
||||
bli_is_io_packed( schema ) || \
|
||||
bli_is_rpi_packed( schema ) )
|
||||
|
||||
// Evaluates to nonzero when the pack-format bits of schema equal
// BLIS_BITVAL_1R (the 1m "reordered" format). The argument is
// parenthesized so that a compound expression such as (a | b) is masked
// as a whole rather than having only its last operand masked.
#define bli_is_1r_packed( schema ) \
\
	( ( (schema) & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R )
|
||||
|
||||
// Evaluates to nonzero when the pack-format bits of schema equal
// BLIS_BITVAL_1E (the 1m "expanded" format). The argument is
// parenthesized so that a compound expression such as (a | b) is masked
// as a whole rather than having only its last operand masked.
#define bli_is_1e_packed( schema ) \
\
	( ( (schema) & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E )
|
||||
|
||||
// Evaluates to nonzero when schema is packed in either of the two 1m
// formats, i.e. when bli_is_1r_packed() or bli_is_1e_packed() holds.
#define bli_is_1m_packed( schema ) \
	( bli_is_1r_packed( schema ) || bli_is_1e_packed( schema ) )
|
||||
|
||||
// Evaluates to nonzero when none of the pack-format bits of schema are
// set. The argument is parenthesized so that a compound expression such
// as (a | b) is masked as a whole rather than having only its last
// operand masked.
#define bli_is_nat_packed( schema ) \
\
	( ( (schema) & BLIS_PACK_FORMAT_BITS ) == 0 )
|
||||
|
||||
@@ -225,6 +225,43 @@
|
||||
#include "bli_scal2jrpis.h"
|
||||
|
||||
|
||||
// -- 1m-specific scalar macros --
|
||||
|
||||
#include "bli_invert1ms_mxn_diag.h"
|
||||
|
||||
#include "bli_scal1ms_mxn.h"
|
||||
|
||||
#include "bli_scal21ms_mxn_diag.h"
|
||||
#include "bli_scal21ms_mxn_uplo.h"
|
||||
|
||||
#include "bli_set1ms_mxn.h"
|
||||
#include "bli_set1ms_mxn_diag.h"
|
||||
#include "bli_set1ms_mxn_uplo.h"
|
||||
#include "bli_seti01ms_mxn_diag.h"
|
||||
|
||||
// 1e
|
||||
#include "bli_copy1es.h"
|
||||
#include "bli_copyj1es.h"
|
||||
|
||||
#include "bli_invert1es.h"
|
||||
|
||||
#include "bli_scal1es.h"
|
||||
|
||||
#include "bli_scal21es.h"
|
||||
#include "bli_scal2j1es.h"
|
||||
|
||||
// 1r
|
||||
#include "bli_copy1rs.h"
|
||||
#include "bli_copyj1rs.h"
|
||||
|
||||
#include "bli_invert1rs.h"
|
||||
|
||||
#include "bli_scal1rs.h"
|
||||
|
||||
#include "bli_scal21rs.h"
|
||||
#include "bli_scal2j1rs.h"
|
||||
|
||||
|
||||
|
||||
// -- Miscellaneous macros --
|
||||
|
||||
|
||||
@@ -224,6 +224,10 @@ typedef dcomplex f77_dcomplex;
|
||||
- 1 0110 11: packed imag-only column panels
|
||||
- 1 0111 10: packed real+imag row panels
|
||||
- 1 0111 11: packed real+imag column panels
|
||||
- 1 1000 10: packed by 1m expanded row panels
|
||||
- 1 1000 11: packed by 1m expanded column panels
|
||||
- 1 1001 10: packed by 1m reordered row panels
|
||||
- 1 1001 11: packed by 1m reordered column panels
|
||||
23 Packed panel order if upper-stored
|
||||
- 0 == forward order if upper
|
||||
- 1 == reverse order if upper
|
||||
@@ -329,6 +333,8 @@ typedef dcomplex f77_dcomplex;
|
||||
#define BLIS_BITVAL_RO ( 0x5 << BLIS_PACK_FORMAT_SHIFT )
|
||||
#define BLIS_BITVAL_IO ( 0x6 << BLIS_PACK_FORMAT_SHIFT )
|
||||
#define BLIS_BITVAL_RPI ( 0x7 << BLIS_PACK_FORMAT_SHIFT )
|
||||
#define BLIS_BITVAL_1E ( 0x8 << BLIS_PACK_FORMAT_SHIFT )
|
||||
#define BLIS_BITVAL_1R ( 0x9 << BLIS_PACK_FORMAT_SHIFT )
|
||||
#define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT )
|
||||
#define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT )
|
||||
#define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT )
|
||||
@@ -348,6 +354,10 @@ typedef dcomplex f77_dcomplex;
|
||||
#define BLIS_BITVAL_PACKED_COL_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
|
||||
#define BLIS_BITVAL_PACKED_ROW_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT )
|
||||
#define BLIS_BITVAL_PACKED_COL_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
|
||||
#define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT )
|
||||
#define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
|
||||
#define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT )
|
||||
#define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
|
||||
#define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0
|
||||
#define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT
|
||||
#define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0
|
||||
@@ -469,13 +479,17 @@ typedef enum
|
||||
BLIS_PACKED_COL_PANELS_IO = BLIS_BITVAL_PACKED_COL_PANELS_IO,
|
||||
BLIS_PACKED_ROW_PANELS_RPI = BLIS_BITVAL_PACKED_ROW_PANELS_RPI,
|
||||
BLIS_PACKED_COL_PANELS_RPI = BLIS_BITVAL_PACKED_COL_PANELS_RPI,
|
||||
BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
|
||||
BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E,
|
||||
BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
|
||||
BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R,
|
||||
} pack_t;
|
||||
|
||||
// We combine row and column packing into one "type", and we start
|
||||
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. We also count the
|
||||
// schema pair for "4ms" (4m separated), because its bit value has
|
||||
// been reserved, even though we don't use it.
|
||||
#define BLIS_NUM_PACK_SCHEMA_TYPES 8
|
||||
#define BLIS_NUM_PACK_SCHEMA_TYPES 10
|
||||
|
||||
|
||||
// -- Pack order type --
|
||||
@@ -575,6 +589,7 @@ typedef enum
|
||||
BLIS_4MH,
|
||||
BLIS_4M1B,
|
||||
BLIS_4M1A,
|
||||
BLIS_1M,
|
||||
BLIS_NAT,
|
||||
} ind_t;
|
||||
|
||||
@@ -960,9 +975,11 @@ typedef struct cntx_s
|
||||
|
||||
opid_t family;
|
||||
ind_t method;
|
||||
pack_t schema_a;
|
||||
pack_t schema_b;
|
||||
pack_t schema_c;
|
||||
pack_t schema_a_block;
|
||||
pack_t schema_b_panel;
|
||||
pack_t schema_c_panel;
|
||||
|
||||
bool_t anti_pref;
|
||||
|
||||
dim_t thrloop[ BLIS_NUM_LOOPS ];
|
||||
|
||||
|
||||
53
frame/include/level0/1e/bli_copy1es.h
Normal file
53
frame/include/level0/1e/bli_copy1es.h
Normal file
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_COPY1ES_H
|
||||
#define BLIS_COPY1ES_H
|
||||
|
||||
// copy1es
|
||||
|
||||
// Single-precision complex copy into the 1e format. Two bli_ccopyris()
// calls are made from the components of a:
//   - ( real(a), imag(a) ) paired with the real/imag parts of bri,
//   - ( -imag(a), real(a) ) paired with the real/imag parts of bir.
// NOTE(review): bli_ccopyris() is defined elsewhere; presumably it copies
// its first two arguments into its last two -- confirm.
#define bli_ccopy1es( a, bri, bir ) \
{ \
	bli_ccopyris(  bli_creal(a),   bli_cimag(a), \
	               bli_creal(bri), bli_cimag(bri) ); \
	bli_ccopyris( -bli_cimag(a),   bli_creal(a), \
	               bli_creal(bir), bli_cimag(bir) ); \
}
|
||||
|
||||
// Double-precision complex copy into the 1e format. Two bli_zcopyris()
// calls are made from the components of a:
//   - ( real(a), imag(a) ) paired with the real/imag parts of bri,
//   - ( -imag(a), real(a) ) paired with the real/imag parts of bir.
// NOTE(review): bli_zcopyris() is defined elsewhere; presumably it copies
// its first two arguments into its last two -- confirm.
#define bli_zcopy1es( a, bri, bir ) \
{ \
	bli_zcopyris(  bli_zreal(a),   bli_zimag(a), \
	               bli_zreal(bri), bli_zimag(bri) ); \
	bli_zcopyris( -bli_zimag(a),   bli_zreal(a), \
	               bli_zreal(bir), bli_zimag(bir) ); \
}
|
||||
|
||||
#endif
|
||||
|
||||
53
frame/include/level0/1e/bli_copyj1es.h
Normal file
53
frame/include/level0/1e/bli_copyj1es.h
Normal file
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_COPYJ1ES_H
|
||||
#define BLIS_COPYJ1ES_H
|
||||
|
||||
// copyj1es
|
||||
|
||||
// Single-precision complex conjugating copy into the 1e format. Two
// bli_ccopyris() calls are made from the components of a:
//   - ( real(a), -imag(a) ) paired with the real/imag parts of bri,
//   - ( imag(a),  real(a) ) paired with the real/imag parts of bir.
// NOTE(review): bli_ccopyris() is defined elsewhere; presumably it copies
// its first two arguments into its last two -- confirm.
#define bli_ccopyj1es( a, bri, bir ) \
{ \
	bli_ccopyris( bli_creal(a),  -bli_cimag(a), \
	              bli_creal(bri), bli_cimag(bri) ); \
	bli_ccopyris( bli_cimag(a),   bli_creal(a), \
	              bli_creal(bir), bli_cimag(bir) ); \
}
|
||||
|
||||
// Double-precision complex conjugating copy into the 1e format. Two
// bli_zcopyris() calls are made from the components of a:
//   - ( real(a), -imag(a) ) paired with the real/imag parts of bri,
//   - ( imag(a),  real(a) ) paired with the real/imag parts of bir.
// NOTE(review): bli_zcopyris() is defined elsewhere; presumably it copies
// its first two arguments into its last two -- confirm.
#define bli_zcopyj1es( a, bri, bir ) \
{ \
	bli_zcopyris( bli_zreal(a),  -bli_zimag(a), \
	              bli_zreal(bri), bli_zimag(bri) ); \
	bli_zcopyris( bli_zimag(a),   bli_zreal(a), \
	              bli_zreal(bir), bli_zimag(bir) ); \
}
|
||||
|
||||
#endif
|
||||
|
||||
53
frame/include/level0/1e/bli_invert1es.h
Normal file
53
frame/include/level0/1e/bli_invert1es.h
Normal file
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_INVERT1ES_H
|
||||
#define BLIS_INVERT1ES_H
|
||||
|
||||
// invert1es
|
||||
|
||||
/*
   bli_cinvert1es( bri, bir )

   Invert bri in place via bli_cinvertris, then refresh bir from the
   updated bri. Note the destination order of the copy is (imag, real):
   assuming bli_ccopyris( ar, ai, br, bi ) assigns br := ar and bi := ai
   (defined elsewhere -- confirm), this yields

     imag(bir) :=  real(bri)
     real(bir) := -imag(bri)
*/
#define bli_cinvert1es( bri, bir ) \
{ \
	/* Step 1: invert the "ri" element in place. */ \
	bli_cinvertris( bli_creal(bri), \
	                bli_cimag(bri) ); \
	/* Step 2: rebuild the "ir" element from the inverted value. */ \
	bli_ccopyris(  bli_creal(bri), \
	              -bli_cimag(bri), \
	               bli_cimag(bir), \
	               bli_creal(bir) ); \
}
|
||||
|
||||
/*
   bli_zinvert1es( bri, bir )

   Double-precision complex counterpart of bli_cinvert1es: invert bri in
   place via bli_zinvertris, then refresh bir from the updated bri. The
   copy's destinations are given in (imag, real) order, so assuming
   bli_zcopyris( ar, ai, br, bi ) assigns br := ar and bi := ai
   (defined elsewhere -- confirm):

     imag(bir) :=  real(bri)
     real(bir) := -imag(bri)
*/
#define bli_zinvert1es( bri, bir ) \
{ \
	/* Step 1: invert the "ri" element in place. */ \
	bli_zinvertris( bli_zreal(bri), \
	                bli_zimag(bri) ); \
	/* Step 2: rebuild the "ir" element from the inverted value. */ \
	bli_zcopyris(  bli_zreal(bri), \
	              -bli_zimag(bri), \
	               bli_zimag(bir), \
	               bli_zreal(bir) ); \
}
|
||||
|
||||
#endif
|
||||
|
||||
53
frame/include/level0/1e/bli_scal1es.h
Normal file
53
frame/include/level0/1e/bli_scal1es.h
Normal file
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SCAL1ES_H
|
||||
#define BLIS_SCAL1ES_H
|
||||
|
||||
// scal1es
|
||||
|
||||
/*
   bli_cscal1es( a, yri, yir )

   Scale yri in place by a (through bli_cscalris), then regenerate yir
   from the freshly scaled yri. Assuming bli_ccopyris( ar, ai, br, bi )
   assigns br := ar and bi := ai (defined elsewhere -- confirm):

     yir := ( -imag(yri), real(yri) )
*/
#define bli_cscal1es( a, yri, yir ) \
{ \
	/* Scale the "ri" element by a. */ \
	bli_cscalris( bli_creal(a), \
	              bli_cimag(a), \
	              bli_creal(yri), \
	              bli_cimag(yri) ); \
	/* Derive the "ir" element from the scaled "ri" element. */ \
	bli_ccopyris( -bli_cimag(yri), \
	               bli_creal(yri), \
	               bli_creal(yir), \
	               bli_cimag(yir) ); \
}
|
||||
|
||||
/*
   bli_zscal1es( a, yri, yir )

   Double-precision complex counterpart of bli_cscal1es: scale yri in
   place by a (through bli_zscalris), then regenerate yir from the
   scaled yri. Assuming bli_zcopyris( ar, ai, br, bi ) assigns br := ar
   and bi := ai (defined elsewhere -- confirm):

     yir := ( -imag(yri), real(yri) )
*/
#define bli_zscal1es( a, yri, yir ) \
{ \
	/* Scale the "ri" element by a. */ \
	bli_zscalris( bli_zreal(a), \
	              bli_zimag(a), \
	              bli_zreal(yri), \
	              bli_zimag(yri) ); \
	/* Derive the "ir" element from the scaled "ri" element. */ \
	bli_zcopyris( -bli_zimag(yri), \
	               bli_zreal(yri), \
	               bli_zreal(yir), \
	               bli_zimag(yir) ); \
}
|
||||
|
||||
#endif
|
||||
|
||||
65
frame/include/level0/1e/bli_scal21es.h
Normal file
65
frame/include/level0/1e/bli_scal21es.h
Normal file
@@ -0,0 +1,65 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SCAL21ES_H
|
||||
#define BLIS_SCAL21ES_H
|
||||
|
||||
// scal21es
|
||||
|
||||
/*
   bli_cscal21es( a, x, yri, yir )

   Produce both 1e-format outputs from a and x with two calls to
   bli_cscal2ris (defined elsewhere; presumably y := a * x on split
   real/imag operands -- confirm):

     yri := scal2ris( a, (  real(x), imag(x) ) )
     yir := scal2ris( a, ( -imag(x), real(x) ) )
*/
#define bli_cscal21es( a, x, yri, yir ) \
{ \
	/* "ri" output: x passed through unchanged. */ \
	bli_cscal2ris(  bli_creal(a), \
	                bli_cimag(a), \
	                bli_creal(x), \
	                bli_cimag(x), \
	                bli_creal(yri), \
	                bli_cimag(yri) ); \
	/* "ir" output: x passed as ( -imag, real ). */ \
	bli_cscal2ris(  bli_creal(a), \
	                bli_cimag(a), \
	               -bli_cimag(x), \
	                bli_creal(x), \
	                bli_creal(yir), \
	                bli_cimag(yir) ); \
}
|
||||
|
||||
/*
   bli_zscal21es( a, x, yri, yir )

   Double-precision complex counterpart of bli_cscal21es. Uses two calls
   to bli_zscal2ris (defined elsewhere; presumably y := a * x on split
   real/imag operands -- confirm):

     yri := scal2ris( a, (  real(x), imag(x) ) )
     yir := scal2ris( a, ( -imag(x), real(x) ) )
*/
#define bli_zscal21es( a, x, yri, yir ) \
{ \
	/* "ri" output: x passed through unchanged. */ \
	bli_zscal2ris(  bli_zreal(a), \
	                bli_zimag(a), \
	                bli_zreal(x), \
	                bli_zimag(x), \
	                bli_zreal(yri), \
	                bli_zimag(yri) ); \
	/* "ir" output: x passed as ( -imag, real ). */ \
	bli_zscal2ris(  bli_zreal(a), \
	                bli_zimag(a), \
	               -bli_zimag(x), \
	                bli_zreal(x), \
	                bli_zreal(yir), \
	                bli_zimag(yir) ); \
}
|
||||
|
||||
/*
   bli_scscal21es( a, x, yri, yir )

   Mixed-type variant of bli_cscal21es: a is read through the s-prefixed
   accessors (bli_sreal / bli_simag) while x, yri and yir are read through
   the c-prefixed ones. Work is delegated to bli_scscal2ris (defined
   elsewhere; presumably y := a * x -- confirm):

     yri := scal2ris( a, (  real(x), imag(x) ) )
     yir := scal2ris( a, ( -imag(x), real(x) ) )
*/
#define bli_scscal21es( a, x, yri, yir ) \
{ \
	/* "ri" output: x passed through unchanged. */ \
	bli_scscal2ris(  bli_sreal(a), \
	                 bli_simag(a), \
	                 bli_creal(x), \
	                 bli_cimag(x), \
	                 bli_creal(yri), \
	                 bli_cimag(yri) ); \
	/* "ir" output: x passed as ( -imag, real ). */ \
	bli_scscal2ris(  bli_sreal(a), \
	                 bli_simag(a), \
	                -bli_cimag(x), \
	                 bli_creal(x), \
	                 bli_creal(yir), \
	                 bli_cimag(yir) ); \
}
|
||||
|
||||
/*
   bli_dzscal21es( a, x, yri, yir )

   Mixed-type variant of bli_zscal21es: a is read through the d-prefixed
   accessors (bli_dreal / bli_dimag) while x, yri and yir are read through
   the z-prefixed ones. Work is delegated to bli_dzscal2ris (defined
   elsewhere; presumably y := a * x -- confirm):

     yri := scal2ris( a, (  real(x), imag(x) ) )
     yir := scal2ris( a, ( -imag(x), real(x) ) )
*/
#define bli_dzscal21es( a, x, yri, yir ) \
{ \
	/* "ri" output: x passed through unchanged. */ \
	bli_dzscal2ris(  bli_dreal(a), \
	                 bli_dimag(a), \
	                 bli_zreal(x), \
	                 bli_zimag(x), \
	                 bli_zreal(yri), \
	                 bli_zimag(yri) ); \
	/* "ir" output: x passed as ( -imag, real ). */ \
	bli_dzscal2ris(  bli_dreal(a), \
	                 bli_dimag(a), \
	                -bli_zimag(x), \
	                 bli_zreal(x), \
	                 bli_zreal(yir), \
	                 bli_zimag(yir) ); \
}
|
||||
|
||||
#endif
|
||||
|
||||
65
frame/include/level0/1e/bli_scal2j1es.h
Normal file
65
frame/include/level0/1e/bli_scal2j1es.h
Normal file
@@ -0,0 +1,65 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SCAL2J1ES_H
|
||||
#define BLIS_SCAL2J1ES_H
|
||||
|
||||
// scal2j1es
|
||||
|
||||
/*
   bli_cscal2j1es( a, x, yri, yir )

   Conjugating variant of bli_cscal21es: x enters with its imaginary part
   negated. Work is delegated to bli_cscal2ris (defined elsewhere;
   presumably y := a * x on split real/imag operands -- confirm):

     yri := scal2ris( a, ( real(x), -imag(x) ) )
     yir := scal2ris( a, ( imag(x),  real(x) ) )
*/
#define bli_cscal2j1es( a, x, yri, yir ) \
{ \
	/* "ri" output: x passed as ( real, -imag ). */ \
	bli_cscal2ris(  bli_creal(a), \
	                bli_cimag(a), \
	                bli_creal(x), \
	               -bli_cimag(x), \
	                bli_creal(yri), \
	                bli_cimag(yri) ); \
	/* "ir" output: x passed as ( imag, real ). */ \
	bli_cscal2ris(  bli_creal(a), \
	                bli_cimag(a), \
	                bli_cimag(x), \
	                bli_creal(x), \
	                bli_creal(yir), \
	                bli_cimag(yir) ); \
}
|
||||
|
||||
/*
   bli_zscal2j1es( a, x, yri, yir )

   Double-precision complex counterpart of bli_cscal2j1es (x enters with
   its imaginary part negated). Work is delegated to bli_zscal2ris
   (defined elsewhere; presumably y := a * x -- confirm):

     yri := scal2ris( a, ( real(x), -imag(x) ) )
     yir := scal2ris( a, ( imag(x),  real(x) ) )
*/
#define bli_zscal2j1es( a, x, yri, yir ) \
{ \
	/* "ri" output: x passed as ( real, -imag ). */ \
	bli_zscal2ris(  bli_zreal(a), \
	                bli_zimag(a), \
	                bli_zreal(x), \
	               -bli_zimag(x), \
	                bli_zreal(yri), \
	                bli_zimag(yri) ); \
	/* "ir" output: x passed as ( imag, real ). */ \
	bli_zscal2ris(  bli_zreal(a), \
	                bli_zimag(a), \
	                bli_zimag(x), \
	                bli_zreal(x), \
	                bli_zreal(yir), \
	                bli_zimag(yir) ); \
}
|
||||
|
||||
/*
   bli_scscal2j1es( a, x, yri, yir )

   Mixed-type, conjugating variant: a is read through the s-prefixed
   accessors, x / yri / yir through the c-prefixed ones, and x enters
   with its imaginary part negated. Work is delegated to bli_scscal2ris
   (defined elsewhere; presumably y := a * x -- confirm):

     yri := scal2ris( a, ( real(x), -imag(x) ) )
     yir := scal2ris( a, ( imag(x),  real(x) ) )
*/
#define bli_scscal2j1es( a, x, yri, yir ) \
{ \
	/* "ri" output: x passed as ( real, -imag ). */ \
	bli_scscal2ris(  bli_sreal(a), \
	                 bli_simag(a), \
	                 bli_creal(x), \
	                -bli_cimag(x), \
	                 bli_creal(yri), \
	                 bli_cimag(yri) ); \
	/* "ir" output: x passed as ( imag, real ). */ \
	bli_scscal2ris(  bli_sreal(a), \
	                 bli_simag(a), \
	                 bli_cimag(x), \
	                 bli_creal(x), \
	                 bli_creal(yir), \
	                 bli_cimag(yir) ); \
}
|
||||
|
||||
/*
   bli_dzscal2j1es( a, x, yri, yir )

   Mixed-type, conjugating variant: a is read through the d-prefixed
   accessors, x / yri / yir through the z-prefixed ones, and x enters
   with its imaginary part negated. Work is delegated to bli_dzscal2ris
   (defined elsewhere; presumably y := a * x -- confirm):

     yri := scal2ris( a, ( real(x), -imag(x) ) )
     yir := scal2ris( a, ( imag(x),  real(x) ) )
*/
#define bli_dzscal2j1es( a, x, yri, yir ) \
{ \
	/* "ri" output: x passed as ( real, -imag ). */ \
	bli_dzscal2ris(  bli_dreal(a), \
	                 bli_dimag(a), \
	                 bli_zreal(x), \
	                -bli_zimag(x), \
	                 bli_zreal(yri), \
	                 bli_zimag(yri) ); \
	/* "ir" output: x passed as ( imag, real ). */ \
	bli_dzscal2ris(  bli_dreal(a), \
	                 bli_dimag(a), \
	                 bli_zimag(x), \
	                 bli_zreal(x), \
	                 bli_zreal(yir), \
	                 bli_zimag(yir) ); \
}
|
||||
|
||||
#endif
|
||||
|
||||
126
frame/include/level0/1m/bli_invert1ms_mxn_diag.h
Normal file
126
frame/include/level0/1m/bli_invert1ms_mxn_diag.h
Normal file
@@ -0,0 +1,126 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_INVERT1MS_MXN_DIAG_H
|
||||
#define BLIS_INVERT1MS_MXN_DIAG_H
|
||||
|
||||
// invert1ms_mxn_diag
|
||||
|
||||
/*
   bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y )

   Invert, in place, the first min(m,n) diagonal elements of the packed
   matrix y, starting at element (offm,offn).

     schema      pack schema; bli_is_1e_packed( schema ) selects the 1e
                 path, every other value is handled as 1r.
     offm, offn  row / column offset of the first diagonal element.
     m, n        dimensions; min(m,n) diagonal elements are visited.
     y           scomplex* base address of the packed matrix.
     rs_y, cs_y  row / column strides in units of scomplex.
     ld_y        leading dimension. In the 1e path the "ir" copy of each
                 element lives ld_y/2 scomplex elements past the "ri"
                 copy; in the 1r path the imaginary value lives ld_y real
                 elements past the real value.

   Every parameter is parenthesised where it is used, so expression
   arguments (e.g. "p + off" for y, or "ld + 1" for ld_y) associate as the
   caller intends. The macro declares the locals i and min_m_n (plus
   per-branch temporaries); arguments must not refer to caller variables
   with those names.
*/
#define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t min_m_n = bli_min( m, n ); \
	dim_t i; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		scomplex* restrict y_off_ri = (y) + (offm)*(rs_y) \
		                                  + (offn)*(cs_y); \
		scomplex* restrict y_off_ir = (y) + (offm)*(rs_y) \
		                                  + (offn)*(cs_y) + (ld_y)/2; \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_cinvert1es( *(y_off_ri + i*(rs_y) + i*(cs_y)), \
			                *(y_off_ir + i*(rs_y) + i*(cs_y)) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = (rs_y); \
		inc_t cs_y2 = (cs_y); \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps */ \
		/* in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		/* Parenthesise y so the cast applies to the whole argument. */ \
		float* restrict y_cast  = ( float* )(y); \
		float* restrict y_off_r = y_cast + (offm)*rs_y2 \
		                                 + (offn)*cs_y2; \
		float* restrict y_off_i = y_cast + (offm)*rs_y2 \
		                                 + (offn)*cs_y2 + (ld_y); \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \
			                *(y_off_i + i*rs_y2 + i*cs_y2) ); \
		} \
	} \
}
|
||||
|
||||
/*
   bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y )

   Double-precision counterpart of bli_cinvert1ms_mxn_diag: invert, in
   place, the first min(m,n) diagonal elements of the packed matrix y,
   starting at element (offm,offn).

     schema      pack schema; bli_is_1e_packed( schema ) selects the 1e
                 path, every other value is handled as 1r.
     offm, offn  row / column offset of the first diagonal element.
     m, n        dimensions; min(m,n) diagonal elements are visited.
     y           dcomplex* base address of the packed matrix.
     rs_y, cs_y  row / column strides in units of dcomplex.
     ld_y        leading dimension. In the 1e path the "ir" copy of each
                 element lives ld_y/2 dcomplex elements past the "ri"
                 copy; in the 1r path the imaginary value lives ld_y real
                 elements past the real value.

   Every parameter is parenthesised where it is used, so expression
   arguments associate as the caller intends. The macro declares the
   locals i and min_m_n (plus per-branch temporaries); arguments must not
   refer to caller variables with those names.
*/
#define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t min_m_n = bli_min( m, n ); \
	dim_t i; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		dcomplex* restrict y_off_ri = (y) + (offm)*(rs_y) \
		                                  + (offn)*(cs_y); \
		dcomplex* restrict y_off_ir = (y) + (offm)*(rs_y) \
		                                  + (offn)*(cs_y) + (ld_y)/2; \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_zinvert1es( *(y_off_ri + i*(rs_y) + i*(cs_y)), \
			                *(y_off_ir + i*(rs_y) + i*(cs_y)) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = (rs_y); \
		inc_t cs_y2 = (cs_y); \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps */ \
		/* in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		/* Parenthesise y so the cast applies to the whole argument. */ \
		double* restrict y_cast  = ( double* )(y); \
		double* restrict y_off_r = y_cast + (offm)*rs_y2 \
		                                  + (offn)*cs_y2; \
		double* restrict y_off_i = y_cast + (offm)*rs_y2 \
		                                  + (offn)*cs_y2 + (ld_y); \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \
			                *(y_off_i + i*rs_y2 + i*cs_y2) ); \
		} \
	} \
}
|
||||
|
||||
#endif
|
||||
124
frame/include/level0/1m/bli_scal1ms_mxn.h
Normal file
124
frame/include/level0/1m/bli_scal1ms_mxn.h
Normal file
@@ -0,0 +1,124 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SCAL1MS_MXN_H
|
||||
#define BLIS_SCAL1MS_MXN_H
|
||||
|
||||
// scal1ms_mxn
|
||||
|
||||
/*
   bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y )

   Scale every element of the m x n packed matrix y by the scalar *a,
   in place.

     schema      pack schema; bli_is_1e_packed( schema ) selects the 1e
                 path, every other value is handled as 1r.
     m, n        number of rows / columns to visit.
     a           pointer to the scalar (dereferenced as *(a)).
     y           scomplex* base address of the packed matrix.
     rs_y, cs_y  row / column strides in units of scomplex.
     ld_y        leading dimension. In the 1e path the "ir" copy of each
                 element lives ld_y/2 scomplex elements past the "ri"
                 copy; in the 1r path the imaginary value lives ld_y real
                 elements past the real value.

   Every parameter is parenthesised where it is used, so expression
   arguments associate as the caller intends. The macro declares the
   locals i and j (plus per-branch temporaries); arguments must not refer
   to caller variables with those names.
*/
#define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t i, j; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		scomplex* restrict y_ri = (y); \
		scomplex* restrict y_ir = (y) + (ld_y)/2; \
\
		for ( j = 0; j < (n); ++j ) \
		for ( i = 0; i < (m); ++i ) \
		{ \
			bli_cscal1es( *(a), \
			              *(y_ri + i*(rs_y) + j*(cs_y)), \
			              *(y_ir + i*(rs_y) + j*(cs_y)) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = (rs_y); \
		inc_t cs_y2 = (cs_y); \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps */ \
		/* in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		/* Parenthesise y so the cast applies to the whole argument. */ \
		float* restrict y_cast = ( float* )(y); \
		float* restrict y_r    = y_cast; \
		float* restrict y_i    = y_cast + (ld_y); \
\
		for ( j = 0; j < (n); ++j ) \
		for ( i = 0; i < (m); ++i ) \
		{ \
			bli_cscal1rs( *(a), \
			              *(y_r + i*rs_y2 + j*cs_y2), \
			              *(y_i + i*rs_y2 + j*cs_y2) ); \
		} \
	} \
}
|
||||
|
||||
/*
   bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y )

   Double-precision counterpart of bli_cscal1ms_mxn: scale every element
   of the m x n packed matrix y by the scalar *a, in place.

     schema      pack schema; bli_is_1e_packed( schema ) selects the 1e
                 path, every other value is handled as 1r.
     m, n        number of rows / columns to visit.
     a           pointer to the scalar (dereferenced as *(a)).
     y           dcomplex* base address of the packed matrix.
     rs_y, cs_y  row / column strides in units of dcomplex.
     ld_y        leading dimension. In the 1e path the "ir" copy of each
                 element lives ld_y/2 dcomplex elements past the "ri"
                 copy; in the 1r path the imaginary value lives ld_y real
                 elements past the real value.

   Every parameter is parenthesised where it is used, so expression
   arguments associate as the caller intends. The macro declares the
   locals i and j (plus per-branch temporaries); arguments must not refer
   to caller variables with those names.
*/
#define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t i, j; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		dcomplex* restrict y_ri = (y); \
		dcomplex* restrict y_ir = (y) + (ld_y)/2; \
\
		for ( j = 0; j < (n); ++j ) \
		for ( i = 0; i < (m); ++i ) \
		{ \
			bli_zscal1es( *(a), \
			              *(y_ri + i*(rs_y) + j*(cs_y)), \
			              *(y_ir + i*(rs_y) + j*(cs_y)) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = (rs_y); \
		inc_t cs_y2 = (cs_y); \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps */ \
		/* in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		/* Parenthesise y so the cast applies to the whole argument. */ \
		double* restrict y_cast = ( double* )(y); \
		double* restrict y_r    = y_cast; \
		double* restrict y_i    = y_cast + (ld_y); \
\
		for ( j = 0; j < (n); ++j ) \
		for ( i = 0; i < (m); ++i ) \
		{ \
			bli_zscal1rs( *(a), \
			              *(y_r + i*rs_y2 + j*cs_y2), \
			              *(y_i + i*rs_y2 + j*cs_y2) ); \
		} \
	} \
}
|
||||
|
||||
#endif
|
||||
126
frame/include/level0/1m/bli_scal21ms_mxn_diag.h
Normal file
126
frame/include/level0/1m/bli_scal21ms_mxn_diag.h
Normal file
@@ -0,0 +1,126 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SCAL21MS_MXN_DIAG_H
|
||||
#define BLIS_SCAL21MS_MXN_DIAG_H
|
||||
|
||||
// scal21ms_mxn_diag
|
||||
|
||||
/*
   bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y )

   For each of the first min(m,n) diagonal positions, combine the
   diagonal element of x with the scalar *a and store the result into the
   corresponding diagonal element of the packed matrix y.

     schema      pack schema; bli_is_1e_packed( schema ) selects the 1e
                 path, every other value is handled as 1r.
     m, n        dimensions; min(m,n) diagonal elements are visited.
     a           pointer to the scalar (dereferenced as *(a)).
     x           base address of the source matrix.
     rs_x, cs_x  row / column strides of x.
     y           scomplex* base address of the packed destination.
     rs_y, cs_y  row / column strides of y in units of scomplex.
     ld_y        leading dimension. In the 1e path the "ir" copy of each
                 element lives ld_y/2 scomplex elements past the "ri"
                 copy; in the 1r path the imaginary value lives ld_y real
                 elements past the real value.

   The x element is deliberately passed in the first ("a") slot of
   bli_scscal21es / bli_scscal21rs and *(a) in the second ("x") slot: in
   bli_scscal21es the first slot is read with the s-prefixed accessors and
   the second with the c-prefixed ones, so x supplies the real operand and
   a the complex one.

   Every parameter is parenthesised where it is used, so expression
   arguments associate as the caller intends. The macro declares the
   locals i and min_m_n (plus per-branch temporaries); arguments must not
   refer to caller variables with those names.
*/
#define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t min_m_n = bli_min( m, n ); \
	dim_t i; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		scomplex* restrict y_off_ri = (y); \
		scomplex* restrict y_off_ir = (y) + (ld_y)/2; \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_scscal21es( *((x)      + i*(rs_x) + i*(cs_x)), \
			                *(a), \
			                *(y_off_ri + i*(rs_y) + i*(cs_y)), \
			                *(y_off_ir + i*(rs_y) + i*(cs_y)) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = (rs_y); \
		inc_t cs_y2 = (cs_y); \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps */ \
		/* in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		/* Parenthesise y so the cast applies to the whole argument. */ \
		float* restrict y_cast  = ( float* )(y); \
		float* restrict y_off_r = y_cast; \
		float* restrict y_off_i = y_cast + (ld_y); \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_scscal21rs( *((x)     + i*(rs_x) + i*(cs_x)), \
			                *(a), \
			                *(y_off_r + i*rs_y2 + i*cs_y2), \
			                *(y_off_i + i*rs_y2 + i*cs_y2) ); \
		} \
	} \
}
|
||||
|
||||
/*
   bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y )

   Double-precision counterpart of bli_cscscal21ms_mxn_diag: for each of
   the first min(m,n) diagonal positions, combine the diagonal element of
   x with the scalar *a and store the result into the corresponding
   diagonal element of the packed matrix y.

     schema      pack schema; bli_is_1e_packed( schema ) selects the 1e
                 path, every other value is handled as 1r.
     m, n        dimensions; min(m,n) diagonal elements are visited.
     a           pointer to the scalar (dereferenced as *(a)).
     x           base address of the source matrix.
     rs_x, cs_x  row / column strides of x.
     y           dcomplex* base address of the packed destination.
     rs_y, cs_y  row / column strides of y in units of dcomplex.
     ld_y        leading dimension. In the 1e path the "ir" copy of each
                 element lives ld_y/2 dcomplex elements past the "ri"
                 copy; in the 1r path the imaginary value lives ld_y real
                 elements past the real value.

   The x element is deliberately passed in the first ("a") slot of
   bli_dzscal21es / bli_dzscal21rs and *(a) in the second ("x") slot: in
   bli_dzscal21es the first slot is read with the d-prefixed accessors and
   the second with the z-prefixed ones, so x supplies the real operand and
   a the complex one.

   Every parameter is parenthesised where it is used, so expression
   arguments associate as the caller intends. The macro declares the
   locals i and min_m_n (plus per-branch temporaries); arguments must not
   refer to caller variables with those names.
*/
#define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t min_m_n = bli_min( m, n ); \
	dim_t i; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		dcomplex* restrict y_off_ri = (y); \
		dcomplex* restrict y_off_ir = (y) + (ld_y)/2; \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_dzscal21es( *((x)      + i*(rs_x) + i*(cs_x)), \
			                *(a), \
			                *(y_off_ri + i*(rs_y) + i*(cs_y)), \
			                *(y_off_ir + i*(rs_y) + i*(cs_y)) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = (rs_y); \
		inc_t cs_y2 = (cs_y); \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps */ \
		/* in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		/* Parenthesise y so the cast applies to the whole argument. */ \
		double* restrict y_cast  = ( double* )(y); \
		double* restrict y_off_r = y_cast; \
		double* restrict y_off_i = y_cast + (ld_y); \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_dzscal21rs( *((x)     + i*(rs_x) + i*(cs_x)), \
			                *(a), \
			                *(y_off_r + i*rs_y2 + i*cs_y2), \
			                *(y_off_i + i*rs_y2 + i*cs_y2) ); \
		} \
	} \
}
|
||||
|
||||
#endif
|
||||
296
frame/include/level0/1m/bli_scal21ms_mxn_uplo.h
Normal file
296
frame/include/level0/1m/bli_scal21ms_mxn_uplo.h
Normal file
@@ -0,0 +1,296 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SCAL21MS_MXN_UPLO_H
|
||||
#define BLIS_SCAL21MS_MXN_UPLO_H
|
||||
|
||||
// scal21ms_mxn_uplo
|
||||
|
||||
/*
   bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y )

   Combine the scalar *a with one triangle (diagonal included) of the
   m x m matrix x and store the result into the same triangle of the
   packed matrix y.

     schema      pack schema; bli_is_1e_packed( schema ) selects the 1e
                 path, every other value is handled as 1r.
     uplo        bli_is_lower( uplo ) visits i = j..m-1 for each column j;
                 any other value visits i = 0..j (the upper triangle).
     conjx       bli_is_conj( conjx ) selects the "2j" element macros
                 (bli_cscal2j1es / bli_cscal2j1rs); otherwise the
                 non-conjugating ones are used.
     m           order of the triangle.
     a           pointer to the scalar (dereferenced as *(a)).
     x           base address of the source matrix.
     rs_x, cs_x  row / column strides of x.
     y           scomplex* base address of the packed destination.
     rs_y, cs_y  row / column strides of y in units of scomplex.
     ld_y        leading dimension. In the 1e path the "ir" copy of each
                 element lives ld_y/2 scomplex elements past the "ri"
                 copy; in the 1r path the imaginary value lives ld_y real
                 elements past the real value.

   Every parameter is parenthesised where it is used, so expression
   arguments associate as the caller intends. The macro declares the
   locals i and j (plus per-branch temporaries); arguments must not refer
   to caller variables with those names.
*/
#define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t i, j; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		scomplex* restrict y_ri = (y); \
		scomplex* restrict y_ir = (y) + (ld_y)/2; \
\
		if ( bli_is_lower( uplo ) ) \
		{ \
			if ( bli_is_conj( conjx ) ) \
			{ \
				for ( j = 0; j < (m); ++j ) \
				for ( i = j; i < (m); ++i ) \
				{ \
					bli_cscal2j1es( *(a), \
					                *((x)  + i*(rs_x) + j*(cs_x)), \
					                *(y_ri + i*(rs_y) + j*(cs_y)), \
					                *(y_ir + i*(rs_y) + j*(cs_y)) ); \
				} \
			} \
			else /* if ( bli_is_noconj( conjx ) ) */ \
			{ \
				for ( j = 0; j < (m); ++j ) \
				for ( i = j; i < (m); ++i ) \
				{ \
					bli_cscal21es( *(a), \
					               *((x)  + i*(rs_x) + j*(cs_x)), \
					               *(y_ri + i*(rs_y) + j*(cs_y)), \
					               *(y_ir + i*(rs_y) + j*(cs_y)) ); \
				} \
			} \
		} \
		else /* if ( bli_is_upper( uplo ) ) */ \
		{ \
			if ( bli_is_conj( conjx ) ) \
			{ \
				for ( j = 0; j < (m); ++j ) \
				for ( i = 0; i < j + 1; ++i ) \
				{ \
					bli_cscal2j1es( *(a), \
					                *((x)  + i*(rs_x) + j*(cs_x)), \
					                *(y_ri + i*(rs_y) + j*(cs_y)), \
					                *(y_ir + i*(rs_y) + j*(cs_y)) ); \
				} \
			} \
			else /* if ( bli_is_noconj( conjx ) ) */ \
			{ \
				for ( j = 0; j < (m); ++j ) \
				for ( i = 0; i < j + 1; ++i ) \
				{ \
					bli_cscal21es( *(a), \
					               *((x)  + i*(rs_x) + j*(cs_x)), \
					               *(y_ri + i*(rs_y) + j*(cs_y)), \
					               *(y_ir + i*(rs_y) + j*(cs_y)) ); \
				} \
			} \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = (rs_y); \
		inc_t cs_y2 = (cs_y); \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps */ \
		/* in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		/* Parenthesise y so the cast applies to the whole argument. */ \
		float* restrict y_cast = ( float* )(y); \
		float* restrict y_r    = y_cast; \
		float* restrict y_i    = y_cast + (ld_y); \
\
		if ( bli_is_lower( uplo ) ) \
		{ \
			if ( bli_is_conj( conjx ) ) \
			{ \
				for ( j = 0; j < (m); ++j ) \
				for ( i = j; i < (m); ++i ) \
				{ \
					bli_cscal2j1rs( *(a), \
					                *((x) + i*(rs_x) + j*(cs_x)), \
					                *(y_r + i*rs_y2  + j*cs_y2), \
					                *(y_i + i*rs_y2  + j*cs_y2) ); \
				} \
			} \
			else /* if ( bli_is_noconj( conjx ) ) */ \
			{ \
				for ( j = 0; j < (m); ++j ) \
				for ( i = j; i < (m); ++i ) \
				{ \
					bli_cscal21rs( *(a), \
					               *((x) + i*(rs_x) + j*(cs_x)), \
					               *(y_r + i*rs_y2  + j*cs_y2), \
					               *(y_i + i*rs_y2  + j*cs_y2) ); \
				} \
			} \
		} \
		else /* if ( bli_is_upper( uplo ) ) */ \
		{ \
			if ( bli_is_conj( conjx ) ) \
			{ \
				for ( j = 0; j < (m); ++j ) \
				for ( i = 0; i < j + 1; ++i ) \
				{ \
					bli_cscal2j1rs( *(a), \
					                *((x) + i*(rs_x) + j*(cs_x)), \
					                *(y_r + i*rs_y2  + j*cs_y2), \
					                *(y_i + i*rs_y2  + j*cs_y2) ); \
				} \
			} \
			else /* if ( bli_is_noconj( conjx ) ) */ \
			{ \
				for ( j = 0; j < (m); ++j ) \
				for ( i = 0; i < j + 1; ++i ) \
				{ \
					bli_cscal21rs( *(a), \
					               *((x) + i*(rs_x) + j*(cs_x)), \
					               *(y_r + i*rs_y2  + j*cs_y2), \
					               *(y_i + i*rs_y2  + j*cs_y2) ); \
				} \
			} \
		} \
	} \
}
|
||||
|
||||
/*
  bli_zscal21ms_mxn_uplo

  Walk one triangle (diagonal included) of the m x m dcomplex matrix x and,
  for every visited element x(i,j), invoke a scal2 kernel with *a, x(i,j) and
  the two matching destination locations inside y.

    schema  selects the layout of y: 1e if bli_is_1e_packed( schema ) is
            true, 1r otherwise.
    uplo    lower: rows i = j .. m-1 of column j; otherwise (upper):
            rows i = 0 .. j of column j.
    conjx   selects the "2j" kernel variants when bli_is_conj( conjx ) is
            true (presumably these conjugate x -- the kernels are defined
            elsewhere; confirm there).
    a       pointer to the dcomplex scalar.
    x       dcomplex* with strides rs_x / cs_x (in dcomplex elements).
    y       dcomplex* destination with strides rs_y / cs_y.
    ld_y    1e: the second destination starts ld_y/2 dcomplex elements past
            y.  1r: the imaginary destination starts ld_y doubles past y.

  In the 1r case the non-unit stride of y is doubled because y is addressed
  in units of double rather than dcomplex. That branch assumes one of
  rs_y / cs_y equals 1.

  All macro arguments are parenthesized on use so that expression arguments
  (e.g. "p + off") are grouped as the caller intended.
*/
#define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t i, j; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		/* Two dcomplex destinations per element, ld_y/2 elements apart. */ \
		dcomplex* restrict y_ri = (y); \
		dcomplex* restrict y_ir = (y) + (ld_y)/2; \
\
		if ( bli_is_lower( uplo ) ) \
		{ \
			if ( bli_is_conj( conjx ) ) \
			{ \
				for ( j = 0; j < (m); ++j ) \
				for ( i = j; i < (m); ++i ) \
				{ \
					bli_zscal2j1es( *(a), \
					                *((x)  + i*(rs_x) + j*(cs_x)), \
					                *(y_ri + i*(rs_y) + j*(cs_y)), \
					                *(y_ir + i*(rs_y) + j*(cs_y)) ); \
				} \
			} \
			else /* if ( bli_is_noconj( conjx ) ) */ \
			{ \
				for ( j = 0; j < (m); ++j ) \
				for ( i = j; i < (m); ++i ) \
				{ \
					bli_zscal21es( *(a), \
					               *((x)  + i*(rs_x) + j*(cs_x)), \
					               *(y_ri + i*(rs_y) + j*(cs_y)), \
					               *(y_ir + i*(rs_y) + j*(cs_y)) ); \
				} \
			} \
		} \
		else /* if ( bli_is_upper( uplo ) ) */ \
		{ \
			if ( bli_is_conj( conjx ) ) \
			{ \
				for ( j = 0; j < (m); ++j ) \
				for ( i = 0; i < j + 1; ++i ) \
				{ \
					bli_zscal2j1es( *(a), \
					                *((x)  + i*(rs_x) + j*(cs_x)), \
					                *(y_ri + i*(rs_y) + j*(cs_y)), \
					                *(y_ir + i*(rs_y) + j*(cs_y)) ); \
				} \
			} \
			else /* if ( bli_is_noconj( conjx ) ) */ \
			{ \
				for ( j = 0; j < (m); ++j ) \
				for ( i = 0; i < j + 1; ++i ) \
				{ \
					bli_zscal21es( *(a), \
					               *((x)  + i*(rs_x) + j*(cs_x)), \
					               *(y_ri + i*(rs_y) + j*(cs_y)), \
					               *(y_ir + i*(rs_y) + j*(cs_y)) ); \
				} \
			} \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = rs_y; \
		inc_t cs_y2 = cs_y; \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps \
		   in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		/* Real parts start at y; imaginary parts ld_y doubles later. */ \
		double* restrict y_cast = ( double* )(y); \
		double* restrict y_r    = y_cast; \
		double* restrict y_i    = y_cast + (ld_y); \
\
		if ( bli_is_lower( uplo ) ) \
		{ \
			if ( bli_is_conj( conjx ) ) \
			{ \
				for ( j = 0; j < (m); ++j ) \
				for ( i = j; i < (m); ++i ) \
				{ \
					bli_zscal2j1rs( *(a), \
					                *((x) + i*(rs_x) + j*(cs_x)), \
					                *(y_r + i*rs_y2  + j*cs_y2), \
					                *(y_i + i*rs_y2  + j*cs_y2) ); \
				} \
			} \
			else /* if ( bli_is_noconj( conjx ) ) */ \
			{ \
				for ( j = 0; j < (m); ++j ) \
				for ( i = j; i < (m); ++i ) \
				{ \
					bli_zscal21rs( *(a), \
					               *((x) + i*(rs_x) + j*(cs_x)), \
					               *(y_r + i*rs_y2  + j*cs_y2), \
					               *(y_i + i*rs_y2  + j*cs_y2) ); \
				} \
			} \
		} \
		else /* if ( bli_is_upper( uplo ) ) */ \
		{ \
			if ( bli_is_conj( conjx ) ) \
			{ \
				for ( j = 0; j < (m); ++j ) \
				for ( i = 0; i < j + 1; ++i ) \
				{ \
					bli_zscal2j1rs( *(a), \
					                *((x) + i*(rs_x) + j*(cs_x)), \
					                *(y_r + i*rs_y2  + j*cs_y2), \
					                *(y_i + i*rs_y2  + j*cs_y2) ); \
				} \
			} \
			else /* if ( bli_is_noconj( conjx ) ) */ \
			{ \
				for ( j = 0; j < (m); ++j ) \
				for ( i = 0; i < j + 1; ++i ) \
				{ \
					bli_zscal21rs( *(a), \
					               *((x) + i*(rs_x) + j*(cs_x)), \
					               *(y_r + i*rs_y2  + j*cs_y2), \
					               *(y_i + i*rs_y2  + j*cs_y2) ); \
				} \
			} \
		} \
	} \
}
|
||||
|
||||
#endif
|
||||
164
frame/include/level0/1m/bli_set1ms_mxn.h
Normal file
164
frame/include/level0/1m/bli_set1ms_mxn.h
Normal file
@@ -0,0 +1,164 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SET1MS_MXN_H
|
||||
#define BLIS_SET1MS_MXN_H
|
||||
|
||||
// set1ms_mxn
|
||||
|
||||
/*
  bli_cset1ms_mxn

  Store the scomplex scalar *a into every element of the m x n region of y
  that begins at row offset offm and column offset offn. y is addressed as
  1e if bli_is_1e_packed( schema ) is true and as 1r otherwise.

    1e: each element is written through bli_ccopy1es to two scomplex
        locations, the second one ld_y/2 scomplex elements past the first.
    1r: each element is written through bli_ccopy1rs to two float locations,
        the second one ld_y floats past the first. The non-unit stride is
        doubled because y is addressed in units of float.

  When cs_y == 1 (row storage) the offsets, dimensions and strides are
  swapped so that the inner loop still moves with unit stride.

  All macro arguments are parenthesized on use so that expression arguments
  are grouped as the caller intended.
*/
#define bli_cset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
	inc_t offm_local = offm; \
	inc_t offn_local = offn; \
	dim_t m_local    = m; \
	dim_t n_local    = n; \
	inc_t rs_y1      = rs_y; \
	inc_t cs_y1      = cs_y; \
	inc_t rs_y2      = rs_y; \
	inc_t cs_y2      = cs_y; \
	dim_t i, j; \
\
	/* Optimization: The loops walk through y with unit stride if y is \
	   column-stored. If y is row-stored, swap the dimensions and strides \
	   to preserve unit stride movement. */ \
	if ( (cs_y) == 1 ) \
	{ \
		bli_swap_incs( offm_local, offn_local ); \
		bli_swap_dims( m_local, n_local ); \
		bli_swap_incs( rs_y1, cs_y1 ); \
		bli_swap_incs( rs_y2, cs_y2 ); \
	} \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		scomplex* restrict y_off_ri = (y) + (offm_local )*rs_y1 \
		                                  + (offn_local )*cs_y1; \
		scomplex* restrict y_off_ir = (y) + (offm_local )*rs_y1 \
		                                  + (offn_local )*cs_y1 + (ld_y)/2; \
\
		for ( j = 0; j < n_local; ++j ) \
		for ( i = 0; i < m_local; ++i ) \
		{ \
			bli_ccopy1es( *(a), \
			              *(y_off_ri + i*rs_y1 + j*cs_y1), \
			              *(y_off_ir + i*rs_y1 + j*cs_y1) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		/* Scale the non-unit stride by two for the 1r loop, which steps \
		   in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		float* restrict y_cast  = ( float* )(y); \
		float* restrict y_off_r = y_cast + (offm_local )*rs_y2 \
		                                 + (offn_local )*cs_y2; \
		float* restrict y_off_i = y_cast + (offm_local )*rs_y2 \
		                                 + (offn_local )*cs_y2 + (ld_y); \
\
		for ( j = 0; j < n_local; ++j ) \
		for ( i = 0; i < m_local; ++i ) \
		{ \
			bli_ccopy1rs( *(a), \
			              *(y_off_r + i*rs_y2 + j*cs_y2), \
			              *(y_off_i + i*rs_y2 + j*cs_y2) ); \
		} \
	} \
}
|
||||
|
||||
/*
  bli_zset1ms_mxn

  Store the dcomplex scalar *a into every element of the m x n region of y
  that begins at row offset offm and column offset offn. y is addressed as
  1e if bli_is_1e_packed( schema ) is true and as 1r otherwise.

    1e: each element is written through bli_zcopy1es to two dcomplex
        locations, the second one ld_y/2 dcomplex elements past the first.
    1r: each element is written through bli_zcopy1rs to two double
        locations, the second one ld_y doubles past the first. The non-unit
        stride is doubled because y is addressed in units of double.

  When cs_y == 1 (row storage) the offsets, dimensions and strides are
  swapped so that the inner loop still moves with unit stride.

  All macro arguments are parenthesized on use so that expression arguments
  are grouped as the caller intended.
*/
#define bli_zset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
	inc_t offm_local = offm; \
	inc_t offn_local = offn; \
	dim_t m_local    = m; \
	dim_t n_local    = n; \
	inc_t rs_y1      = rs_y; \
	inc_t cs_y1      = cs_y; \
	inc_t rs_y2      = rs_y; \
	inc_t cs_y2      = cs_y; \
	dim_t i, j; \
\
	/* Optimization: The loops walk through y with unit stride if y is \
	   column-stored. If y is row-stored, swap the dimensions and strides \
	   to preserve unit stride movement. */ \
	if ( (cs_y) == 1 ) \
	{ \
		bli_swap_incs( offm_local, offn_local ); \
		bli_swap_dims( m_local, n_local ); \
		bli_swap_incs( rs_y1, cs_y1 ); \
		bli_swap_incs( rs_y2, cs_y2 ); \
	} \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		dcomplex* restrict y_off_ri = (y) + (offm_local )*rs_y1 \
		                                  + (offn_local )*cs_y1; \
		dcomplex* restrict y_off_ir = (y) + (offm_local )*rs_y1 \
		                                  + (offn_local )*cs_y1 + (ld_y)/2; \
\
		for ( j = 0; j < n_local; ++j ) \
		for ( i = 0; i < m_local; ++i ) \
		{ \
			bli_zcopy1es( *(a), \
			              *(y_off_ri + i*rs_y1 + j*cs_y1), \
			              *(y_off_ir + i*rs_y1 + j*cs_y1) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		/* Scale the non-unit stride by two for the 1r loop, which steps \
		   in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		double* restrict y_cast  = ( double* )(y); \
		double* restrict y_off_r = y_cast + (offm_local )*rs_y2 \
		                                  + (offn_local )*cs_y2; \
		double* restrict y_off_i = y_cast + (offm_local )*rs_y2 \
		                                  + (offn_local )*cs_y2 + (ld_y); \
\
		for ( j = 0; j < n_local; ++j ) \
		for ( i = 0; i < m_local; ++i ) \
		{ \
			bli_zcopy1rs( *(a), \
			              *(y_off_r + i*rs_y2 + j*cs_y2), \
			              *(y_off_i + i*rs_y2 + j*cs_y2) ); \
		} \
	} \
}
|
||||
|
||||
#endif
|
||||
130
frame/include/level0/1m/bli_set1ms_mxn_diag.h
Normal file
130
frame/include/level0/1m/bli_set1ms_mxn_diag.h
Normal file
@@ -0,0 +1,130 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SET1MS_MXN_DIAG_H
|
||||
#define BLIS_SET1MS_MXN_DIAG_H
|
||||
|
||||
// set1ms_mxn_diag
|
||||
|
||||
/*
  bli_cset1ms_mxn_diag

  Store the scomplex scalar *a into the first min( m, n ) diagonal elements
  of the region of y that begins at row offset offm and column offset offn.
  y is addressed as 1e if bli_is_1e_packed( schema ) is true and as 1r
  otherwise.

    1e: each diagonal element is written through bli_ccopy1es to two
        scomplex locations, the second ld_y/2 scomplex elements past the
        first.
    1r: each diagonal element is written through bli_ccopy1rs to two float
        locations, the second ld_y floats past the first. The non-unit
        stride is doubled because y is addressed in units of float.

  All macro arguments are parenthesized on use so that expression arguments
  are grouped as the caller intended.
*/
#define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t min_m_n = bli_min( (m), (n) ); \
	dim_t i; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		scomplex* restrict y_off_ri = (y) + (offm )*(rs_y) \
		                                  + (offn )*(cs_y); \
		scomplex* restrict y_off_ir = (y) + (offm )*(rs_y) \
		                                  + (offn )*(cs_y) + (ld_y)/2; \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_ccopy1es( *(a), \
			              *(y_off_ri + i*(rs_y) + i*(cs_y)), \
			              *(y_off_ir + i*(rs_y) + i*(cs_y)) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = rs_y; \
		inc_t cs_y2 = cs_y; \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps \
		   in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		float* restrict y_cast  = ( float* )(y); \
		float* restrict y_off_r = y_cast + (offm )*rs_y2 \
		                                 + (offn )*cs_y2; \
		float* restrict y_off_i = y_cast + (offm )*rs_y2 \
		                                 + (offn )*cs_y2 + (ld_y); \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_ccopy1rs( *(a), \
			              *(y_off_r + i*rs_y2 + i*cs_y2), \
			              *(y_off_i + i*rs_y2 + i*cs_y2) ); \
		} \
	} \
}
|
||||
|
||||
/*
  bli_zset1ms_mxn_diag

  Store the dcomplex scalar *a into the first min( m, n ) diagonal elements
  of the region of y that begins at row offset offm and column offset offn.
  y is addressed as 1e if bli_is_1e_packed( schema ) is true and as 1r
  otherwise.

    1e: each diagonal element is written through bli_zcopy1es to two
        dcomplex locations, the second ld_y/2 dcomplex elements past the
        first.
    1r: each diagonal element is written through bli_zcopy1rs to two double
        locations, the second ld_y doubles past the first. The non-unit
        stride is doubled because y is addressed in units of double.

  All macro arguments are parenthesized on use so that expression arguments
  are grouped as the caller intended.
*/
#define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t min_m_n = bli_min( (m), (n) ); \
	dim_t i; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		dcomplex* restrict y_off_ri = (y) + (offm )*(rs_y) \
		                                  + (offn )*(cs_y); \
		dcomplex* restrict y_off_ir = (y) + (offm )*(rs_y) \
		                                  + (offn )*(cs_y) + (ld_y)/2; \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_zcopy1es( *(a), \
			              *(y_off_ri + i*(rs_y) + i*(cs_y)), \
			              *(y_off_ir + i*(rs_y) + i*(cs_y)) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = rs_y; \
		inc_t cs_y2 = cs_y; \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps \
		   in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		double* restrict y_cast  = ( double* )(y); \
		double* restrict y_off_r = y_cast + (offm )*rs_y2 \
		                                  + (offn )*cs_y2; \
		double* restrict y_off_i = y_cast + (offm )*rs_y2 \
		                                  + (offn )*cs_y2 + (ld_y); \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_zcopy1rs( *(a), \
			              *(y_off_r + i*rs_y2 + i*cs_y2), \
			              *(y_off_i + i*rs_y2 + i*cs_y2) ); \
		} \
	} \
}
|
||||
|
||||
#endif
|
||||
198
frame/include/level0/1m/bli_set1ms_mxn_uplo.h
Normal file
198
frame/include/level0/1m/bli_set1ms_mxn_uplo.h
Normal file
@@ -0,0 +1,198 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SET1MS_MXN_UPLO_H
|
||||
#define BLIS_SET1MS_MXN_UPLO_H
|
||||
|
||||
// set1ms_mxn_uplo
|
||||
|
||||
/*
  bli_cset1ms_mxn_uplo

  Store the scomplex scalar *a into one triangle (diagonal included) of y,
  after first advancing the start of y by |diagoff| columns when
  diagoff > 0, or by |diagoff| rows otherwise.

    uplo    lower: rows i = j .. m-1 of each column j = 0 .. n-1;
            otherwise (upper): rows i = 0 .. j of each column j.
            NOTE(review): the upper loop does not clamp i against m;
            callers presumably guarantee n <= m -- confirm.
    schema  1e if bli_is_1e_packed( schema ) is true: each element is
            written through bli_ccopy1es to two scomplex locations ld_y/2
            scomplex elements apart. Otherwise 1r: each element is written
            through bli_ccopy1rs to two float locations ld_y floats apart,
            with the non-unit stride doubled because y is addressed in
            units of float.

  All macro arguments are parenthesized on use so that expression arguments
  are grouped as the caller intended.
*/
#define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
	doff_t diagoff_abs = bli_abs( (diagoff) ); \
	inc_t  offdiag_inc; \
	dim_t  i, j; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		/* Set the off-diagonal increment. */ \
		if ( (diagoff) > 0 )          offdiag_inc = (cs_y); \
		else /* if ( diagoff < 0 ) */ offdiag_inc = (rs_y); \
\
		scomplex* restrict y0   = (y) + (diagoff_abs )*offdiag_inc; \
		scomplex* restrict y_ri = y0; \
		scomplex* restrict y_ir = y0 + (ld_y)/2; \
\
		if ( bli_is_lower( uplo ) ) \
		{ \
			for ( j = 0; j < (n); ++j ) \
			for ( i = j; i < (m); ++i ) \
			{ \
				bli_ccopy1es( *(a), \
				              *(y_ri + i*(rs_y) + j*(cs_y)), \
				              *(y_ir + i*(rs_y) + j*(cs_y)) ); \
			} \
		} \
		else /* if ( bli_is_upper( uplo ) ) */ \
		{ \
			for ( j = 0; j < (n); ++j ) \
			for ( i = 0; i < j + 1; ++i ) \
			{ \
				bli_ccopy1es( *(a), \
				              *(y_ri + i*(rs_y) + j*(cs_y)), \
				              *(y_ir + i*(rs_y) + j*(cs_y)) ); \
			} \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = rs_y; \
		inc_t cs_y2 = cs_y; \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps \
		   in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		/* Set the off-diagonal increment. */ \
		if ( (diagoff) > 0 )          offdiag_inc = cs_y2; \
		else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y2; \
\
		float* restrict y0  = ( float* )(y) + (diagoff_abs )*offdiag_inc; \
		float* restrict y_r = y0; \
		float* restrict y_i = y0 + (ld_y); \
\
		if ( bli_is_lower( uplo ) ) \
		{ \
			for ( j = 0; j < (n); ++j ) \
			for ( i = j; i < (m); ++i ) \
			{ \
				bli_ccopy1rs( *(a), \
				              *(y_r + i*rs_y2 + j*cs_y2), \
				              *(y_i + i*rs_y2 + j*cs_y2) ); \
			} \
		} \
		else /* if ( bli_is_upper( uplo ) ) */ \
		{ \
			for ( j = 0; j < (n); ++j ) \
			for ( i = 0; i < j + 1; ++i ) \
			{ \
				bli_ccopy1rs( *(a), \
				              *(y_r + i*rs_y2 + j*cs_y2), \
				              *(y_i + i*rs_y2 + j*cs_y2) ); \
			} \
		} \
	} \
}
|
||||
|
||||
/*
  bli_zset1ms_mxn_uplo

  Store the dcomplex scalar *a into one triangle (diagonal included) of y,
  after first advancing the start of y by |diagoff| columns when
  diagoff > 0, or by |diagoff| rows otherwise.

    uplo    lower: rows i = j .. m-1 of each column j = 0 .. n-1;
            otherwise (upper): rows i = 0 .. j of each column j.
            NOTE(review): the upper loop does not clamp i against m;
            callers presumably guarantee n <= m -- confirm.
    schema  1e if bli_is_1e_packed( schema ) is true: each element is
            written through bli_zcopy1es to two dcomplex locations ld_y/2
            dcomplex elements apart. Otherwise 1r: each element is written
            through bli_zcopy1rs to two double locations ld_y doubles
            apart, with the non-unit stride doubled because y is addressed
            in units of double.

  All macro arguments are parenthesized on use so that expression arguments
  are grouped as the caller intended.
*/
#define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
	doff_t diagoff_abs = bli_abs( (diagoff) ); \
	inc_t  offdiag_inc; \
	dim_t  i, j; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		/* Set the off-diagonal increment. */ \
		if ( (diagoff) > 0 )          offdiag_inc = (cs_y); \
		else /* if ( diagoff < 0 ) */ offdiag_inc = (rs_y); \
\
		dcomplex* restrict y0   = (y) + (diagoff_abs )*offdiag_inc; \
		dcomplex* restrict y_ri = y0; \
		dcomplex* restrict y_ir = y0 + (ld_y)/2; \
\
		if ( bli_is_lower( uplo ) ) \
		{ \
			for ( j = 0; j < (n); ++j ) \
			for ( i = j; i < (m); ++i ) \
			{ \
				bli_zcopy1es( *(a), \
				              *(y_ri + i*(rs_y) + j*(cs_y)), \
				              *(y_ir + i*(rs_y) + j*(cs_y)) ); \
			} \
		} \
		else /* if ( bli_is_upper( uplo ) ) */ \
		{ \
			for ( j = 0; j < (n); ++j ) \
			for ( i = 0; i < j + 1; ++i ) \
			{ \
				bli_zcopy1es( *(a), \
				              *(y_ri + i*(rs_y) + j*(cs_y)), \
				              *(y_ir + i*(rs_y) + j*(cs_y)) ); \
			} \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = rs_y; \
		inc_t cs_y2 = cs_y; \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps \
		   in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		/* Set the off-diagonal increment. */ \
		if ( (diagoff) > 0 )          offdiag_inc = cs_y2; \
		else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y2; \
\
		double* restrict y0  = ( double* )(y) + (diagoff_abs )*offdiag_inc; \
		double* restrict y_r = y0; \
		double* restrict y_i = y0 + (ld_y); \
\
		if ( bli_is_lower( uplo ) ) \
		{ \
			for ( j = 0; j < (n); ++j ) \
			for ( i = j; i < (m); ++i ) \
			{ \
				bli_zcopy1rs( *(a), \
				              *(y_r + i*rs_y2 + j*cs_y2), \
				              *(y_i + i*rs_y2 + j*cs_y2) ); \
			} \
		} \
		else /* if ( bli_is_upper( uplo ) ) */ \
		{ \
			for ( j = 0; j < (n); ++j ) \
			for ( i = 0; i < j + 1; ++i ) \
			{ \
				bli_zcopy1rs( *(a), \
				              *(y_r + i*rs_y2 + j*cs_y2), \
				              *(y_i + i*rs_y2 + j*cs_y2) ); \
			} \
		} \
	} \
}
|
||||
|
||||
#endif
|
||||
114
frame/include/level0/1m/bli_seti01ms_mxn_diag.h
Normal file
114
frame/include/level0/1m/bli_seti01ms_mxn_diag.h
Normal file
@@ -0,0 +1,114 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SETI01MS_MXN_DIAG_H
|
||||
#define BLIS_SETI01MS_MXN_DIAG_H
|
||||
|
||||
// seti01ms_mxn_diag
|
||||
|
||||
/*
  bli_cseti01ms_mxn_diag

  Visit the first min( m, n ) diagonal elements of y and clear one component
  of each stored copy. y is addressed as 1e if bli_is_1e_packed( schema ) is
  true and as 1r otherwise.

    1e: bli_cseti0s is applied to the diagonal element of the first copy
        and bli_csetr0s to the diagonal element of the second copy, which
        lies ld_y/2 scomplex elements past the first. (Judging by their
        names these zero the imaginary and the real component respectively;
        they are defined elsewhere -- confirm.)
    1r: bli_sset0s is applied to the float located ld_y floats past the
        diagonal position, i.e. the location the other 1r macros use for
        the imaginary part. The non-unit stride is doubled because y is
        addressed in units of float.

  All macro arguments are parenthesized on use so that expression arguments
  are grouped as the caller intended.
*/
#define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t min_m_n = bli_min( (m), (n) ); \
	dim_t i; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		scomplex* restrict y_off_ri = (y); \
		scomplex* restrict y_off_ir = (y) + (ld_y)/2; \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_cseti0s( *(y_off_ri + i*(rs_y) + i*(cs_y)) ); \
			bli_csetr0s( *(y_off_ir + i*(rs_y) + i*(cs_y)) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = rs_y; \
		inc_t cs_y2 = cs_y; \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps \
		   in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		float* restrict y_cast  = ( float* )(y); \
		float* restrict y_off_i = y_cast + (ld_y); \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \
		} \
	} \
}
|
||||
|
||||
/*
  bli_zseti01ms_mxn_diag

  Visit the first min( m, n ) diagonal elements of y and clear one component
  of each stored copy. y is addressed as 1e if bli_is_1e_packed( schema ) is
  true and as 1r otherwise.

    1e: bli_zseti0s is applied to the diagonal element of the first copy
        and bli_zsetr0s to the diagonal element of the second copy, which
        lies ld_y/2 dcomplex elements past the first. (Judging by their
        names these zero the imaginary and the real component respectively;
        they are defined elsewhere -- confirm.)
    1r: bli_dset0s is applied to the double located ld_y doubles past the
        diagonal position, i.e. the location the other 1r macros use for
        the imaginary part. The non-unit stride is doubled because y is
        addressed in units of double.

  All macro arguments are parenthesized on use so that expression arguments
  are grouped as the caller intended.
*/
#define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t min_m_n = bli_min( (m), (n) ); \
	dim_t i; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		dcomplex* restrict y_off_ri = (y); \
		dcomplex* restrict y_off_ir = (y) + (ld_y)/2; \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_zseti0s( *(y_off_ri + i*(rs_y) + i*(cs_y)) ); \
			bli_zsetr0s( *(y_off_ir + i*(rs_y) + i*(cs_y)) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = rs_y; \
		inc_t cs_y2 = cs_y; \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps \
		   in units of real (not complex) values. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		double* restrict y_cast  = ( double* )(y); \
		double* restrict y_off_i = y_cast + (ld_y); \
\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \
		} \
	} \
}
|
||||
|
||||
#endif
|
||||
51
frame/include/level0/1r/bli_copy1rs.h
Normal file
51
frame/include/level0/1r/bli_copy1rs.h
Normal file
@@ -0,0 +1,51 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_COPY1RS_H
|
||||
#define BLIS_COPY1RS_H
|
||||
|
||||
// copy1rs
|
||||
|
||||
/*
  bli_ccopy1rs

  Split the scomplex value a into its components: bli_creal( a ) and
  bli_cimag( a ) are forwarded, together with the two destinations br and
  bi, to bli_ccopyris (defined elsewhere; presumably it assigns the real
  part to br and the imaginary part to bi).
*/
#define bli_ccopy1rs( a, br, bi ) \
{ \
	bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \
}
|
||||
|
||||
/*
  bli_zcopy1rs

  Split the dcomplex value a into its components: bli_zreal( a ) and
  bli_zimag( a ) are forwarded, together with the two destinations br and
  bi, to bli_zcopyris (defined elsewhere; presumably it assigns the real
  part to br and the imaginary part to bi).
*/
#define bli_zcopy1rs( a, br, bi ) \
{ \
	bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \
}
|
||||
|
||||
#endif
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user