diff --git a/README.md b/README.md index 565efb09e..a6b69a5c3 100644 --- a/README.md +++ b/README.md @@ -169,10 +169,14 @@ performance remain attainable. * **A foundation for mixed domain and/or mixed precision operations.** BLIS was designed with the hope of one day allowing computation on real and complex operands within the same operation. Similarly, we wanted to allow mixing -operands' floating-point precisions, or both domain and precision. -While this feature is not yet implemented, we plan to prototype and explore -the potential for adding mixed domain, mixed precision support to operations -such as `gemm`. +operands' numerical domains, floating-point precisions, or both domain and +precision, and to optionally compute in a precision different than one or both +operands' storage precisions. This feature has been implemented for the general +matrix multiplication (`gemm`) operation, providing 128 different possible type +combinations, which, when combined with existing transposition, conjugation, +and storage parameters, enables 55,296 different `gemm` use cases. For more +details, please see the documentation on [mixed datatype](docs/MixedDatatypes.md) +support. Getting Started --------------- @@ -230,6 +234,9 @@ included in the BLIS source distribution. table of supported microarchitectures. * **[Multithreading](docs/Multithreading.md).** This document describes how to use the multithreading features of BLIS. + * **[Mixed-Datatype](docs/MixedDatatypes.md).** This document provides an +overview of BLIS's mixed-datatype functionality and provides a brief example +of how to take advantage of this new code. * **[Release Notes](docs/ReleaseNotes.md).** This document tracks a summary of changes included with each new version of BLIS, along with contributor credits for key features. 
diff --git a/build/bli_config.h.in b/build/bli_config.h.in index 97b2fcca0..b7e5adf85 100644 --- a/build/bli_config.h.in +++ b/build/bli_config.h.in @@ -92,6 +92,26 @@ #endif #endif +#ifndef BLIS_ENABLE_MIXED_DT +#ifndef BLIS_DISABLE_MIXED_DT +#if @enable_mixed_dt@ +#define BLIS_ENABLE_MIXED_DT +#else +#define BLIS_DISABLE_MIXED_DT +#endif +#endif +#endif + +#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM +#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM +#if @enable_mixed_dt_extra_mem@ +#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM +#else +#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM +#endif +#endif +#endif + #if @enable_memkind@ #define BLIS_ENABLE_MEMKIND #else diff --git a/configure b/configure index 0a8f58c0a..e5c17fd5f 100755 --- a/configure +++ b/configure @@ -191,6 +191,24 @@ print_usage() echo " compatibility layer. This automatically enables the" echo " BLAS compatibility layer as well." echo " " + echo " --disable-mixed-dt, --enable-mixed-dt" + echo " " + echo " Disable (enabled by default) support for mixing the" + echo " storage domain and/or storage precision of matrix" + echo " operands for the gemm operation, as well as support" + echo " for computing in a precision different from one or" + echo " both of matrices A and B." + echo " " + echo " --disable-mixed-dt-extra-mem, --enable-mixed-dt-extra-mem" + echo " " + echo " Disable (enabled by default) support for additional" + echo " mixed datatype optimizations that require temporarily" + echo " allocating extra memory--specifically, a single m x n" + echo " matrix (per application thread) whose storage datatype" + echo " is equal to the computation datatype. This option may" + echo " only be enabled when mixed domain/precision support is" + echo " enabled." + echo " " echo " -s NAME --enable-sandbox=NAME" echo " " echo " Enable a separate sandbox implementation of gemm. 
This" @@ -1605,6 +1623,8 @@ main() blas_int_type_size=32 enable_blas='yes' enable_cblas='no' + enable_mixed_dt='yes' + enable_mixed_dt_extra_mem='yes' enable_memkind='' # The default memkind value is determined later on. force_version='no' @@ -1739,6 +1759,18 @@ main() disable-cblas) enable_cblas='no' ;; + enable-mixed-dt) + enable_mixed_dt='yes' + ;; + disable-mixed-dt) + enable_mixed_dt='no' + ;; + enable-mixed-dt-extra-mem) + enable_mixed_dt_extra_mem='yes' + ;; + disable-mixed-dt-extra-mem) + enable_mixed_dt_extra_mem='no' + ;; with-memkind) enable_memkind='yes' ;; @@ -2414,8 +2446,35 @@ main() echo "${script_name}: the CBLAS compatibility layer is disabled." enable_cblas_01=0 fi - - # Report integer sizes + if [ "x${enable_mixed_dt}" = "xyes" ]; then + echo "${script_name}: mixed datatype support is enabled." + + if [ "x${enable_mixed_dt_extra_mem}" = "xyes" ]; then + echo "${script_name}: mixed datatype optimizations requiring extra memory are enabled." + enable_mixed_dt_extra_mem_01=1 + else + echo "${script_name}: mixed datatype optimizations requiring extra memory are disabled." + enable_mixed_dt_extra_mem_01=0 + fi + + enable_mixed_dt_01=1 + else + echo "${script_name}: mixed datatype support is disabled." + + if [ "x${enable_mixed_dt_extra_mem}" = "xyes" ]; then + echo "${script_name}: *** Mixed datatype optimizations requiring extra memory are only" + echo "${script_name}: *** available when mixed datatype support is also enabled." + echo "${script_name}: *** Please enable mixed datatype support, or disable mixed datatype" + echo "${script_name}: *** optimizations requiring extra memory, and re-run configure." + exit 1 + else + enable_mixed_dt_extra_mem_01=0 + fi + + enable_mixed_dt_01=0 + fi + + # Report integer sizes. if [ "x${int_type_size}" = "x32" ]; then echo "${script_name}: the internal integer size is 32-bit." 
elif [ "x${int_type_size}" = "x64" ]; then @@ -2595,6 +2654,8 @@ main() | sed -e "s/@blas_int_type_size@/${blas_int_type_size}/g" \ | sed -e "s/@enable_blas@/${enable_blas_01}/g" \ | sed -e "s/@enable_cblas@/${enable_cblas_01}/g" \ + | sed -e "s/@enable_mixed_dt@/${enable_mixed_dt_01}/g" \ + | sed -e "s/@enable_mixed_dt_extra_mem@/${enable_mixed_dt_extra_mem_01}/g" \ | sed -e "s/@enable_memkind@/${enable_memkind_01}/g" \ | sed -e "s/@enable_sandbox@/${enable_sandbox_01}/g" \ | sed -e "s/@enable_shared@/${enable_shared_01}/g" \ diff --git a/docs/MixedDatatypes.md b/docs/MixedDatatypes.md new file mode 100644 index 000000000..90c2a8703 --- /dev/null +++ b/docs/MixedDatatypes.md @@ -0,0 +1,217 @@ +## Contents + +* **[Contents](MixedDatatypes.md#contents)** +* **[Introduction](MixedDatatypes.md#introduction)** +* **[Categories of mixed datatypes](MixedDatatypes.md#categories-of-mixed-datatypes)** + * **[Computation precision](MixedDatatypes.md#computation-precision)** + * **[Computation domain](MixedDatatypes.md#computation-domain)** +* **[Performing gemm with mixed datatypes](MixedDatatypes.md#performing-gemm-with-mixed-datatypes)** +* **[Known Issues](MixedDatatypes.md#known-issues)** +* **[Conclusion](MixedDatatypes.md#conclusion)** + +## Introduction + +This document serves as a guide to users interested in taking advantage of +BLIS's support for performing the `gemm` operation on operands of differing +types. + +## Categories of mixed datatypes + +Before going any further, we find it useful to categorize mixed datatype +support into four categories: + +1. **Fully identical datatypes.** This is what people generally think of when +they think about the `gemm` operation: all operands are stored in the same +datatype (precision and domain), and the matrix product computation is +performed in the arithmetic represented by that datatype. (This category +doesn't actually involve mixing datatypes, but it's still worthwhile to +define.) 
+Example: matrix C updated by the product of matrix A and matrix B +(all matrices double-precision real). + +2. **Mixed domain with identical precisions.** This category includes all +combinations of datatypes where the domain (real or complex) of each +operand may vary while the precisions (single or double precision) are +held constant across all operands. +Example: complex matrix C updated by the product of real matrix A and +complex matrix B (all matrices single-precision). + +3. **Mixed precision within a single domain.** Here, all operands are stored +in the same domain (real or complex), however, the precision of each operand +may vary. +Example: double-precision real matrix C updated by the product of +single-precision real matrix A and single-precision real matrix B. + +4. **Mixed precision and mixed domain.** This category allows both domains and +precision of each matrix operand to vary. +Example: double-precision complex matrix C updated by the product of +single-precision complex matrix A and single-precision real matrix B. + +BLIS's implementation of mixed-datatype `gemm` supports all combinations +within all four categories. + +### Computation precision + +Because categories 3 and 4 involve mixing precisions, they come with an added +parameter: the *computation precision*. This parameter specifies the precision +in which the matrix multiplication (product) takes place. This precision +can be different than the storage precision of matrices A or B, and/or the +storage precision of matrix C. + +When the computation precision differs from the storage precision of matrix A, +it implies that a typecast must occur when BLIS packs matrix A to contiguous +storage. Similarly, B may also need to be typecast during packing. + +When the computation precision differs from the storage precision of C, it +means the result of the matrix product A*B must be typecast just before it +is accumulated back into matrix C. 
+ +### Computation domain + +In addition to the computation precision, we also track a computation domain. +(Together, they form the computation datatype.) However, for now we do not +allow the user to explicitly specify the computation domain. Instead, the +computation domain is implied by the domains of A, B, and C. The following +table enumerates the six cases where there is at least one operand of each +domain, along with the corresponding same-domain cases from category 1 for +reference. We also list the total number of floating-point operations +performed in each case. +In the table, an 'R' denotes a real domain matrix operand while a 'C' denotes +a matrix in the complex domain. The R's and C's appear in the following +format of C += A * B, where A, B, and C are the matrix operands of `gemm`. + +| Case # | Mixed domain case | Implied computation domain | flops performed | +|--------|:-----------------:|:--------------------------:|:---------------:| +| 1 | R += R * R | real | 2mnk | +| 2 | R += R * C | real | 2mnk | +| 3 | R += C * R | real | 2mnk | +| 4 | R += C * C | complex | 4mnk | +| 5 | C += R * R | real | 2mnk | +| 6 | C += R * C | complex | 4mnk | +| 7 | C += C * R | complex | 4mnk | +| 8 | C += C * C | complex | 8mnk | + +The computation domain is implied in cases 1 and 8 in the same way that +it would be if mixed datatype support were absent entirely. These +cases execute 2mnk and 8mnk flops, respectively, as any traditional +implementation would. + +In cases 2 and 3, we assume the computation domain is real because only +B or A, respectively, is complex. Thus, in these cases, the imaginary +components of the complex matrix are ignored, allowing us to perform +only 2mnk flops. + +In case 5, we take the computation domain to be real because A and B are +both real, and thus it makes no sense to compute in the complex domain. +This means that we need only update the real components of C, leaving +the imaginary components untouched. 
This also results in 2mnk flops +being performed. + +In case 4, we have complex A and B, allowing us to compute a complex +product. However, we can only save the real part of that complex product +since the output matrix C is real. Since we cannot update the imaginary +component of C (since it is not stored), we avoid computing that half of +the update entirely, reducing the flops performed to 4mnk. (Alternatively, +one may wish to request real domain computation, in which case the +imaginary components of A and B would be ignored *prior* to computing the +matrix product. This approach would result in only 2mnk flops being +performed.) + +In case 6, we wish for both the real and imaginary parts of B to participate +in the multiplication by A, with the result updating the corresponding real +and imaginary parts of C. Granted, the imaginary part of A is zero, and this +is taken advantage of in the computation to optimize performance, as indicated +by the 4mnk flop count. But fundamentally this computation executes in the +complex domain because both the real and imaginary parts of C are updated. +A similar story can be told about case 7. + +## Performing gemm with mixed datatypes + +In BLIS, performing a mixed-datatype `gemm` operation is easy. However, +it will require that the user call `gemm` through BLIS's object API. +For a basic series of examples for using the object-based API, please +see the example codes in the `examples/oapi` directory of the BLIS source +distribution. + +The first step is to ensure that BLIS is configured with mixed datatype support. +Please consult with your current distribution's `configure` script for the +current semantics: +``` +$ ./configure --help +``` +As of this writing, mixed datatype support is enabled by default, and thus +no additional options are needed. + +With mixed datatype support enabled in BLIS, using the functionality is +simply a matter of creating and initializing matrices of different precisions +and/or domains. 
+```c +dim_t m = 5, n = 4, k = 2; +obj_t a, b, c; +obj_t* alpha; +obj_t* beta; + +bli_obj_create( BLIS_DOUBLE, m, k, 0, 0, &a ); +bli_obj_create( BLIS_FLOAT, k, n, 0, 0, &b ); +bli_obj_create( BLIS_SCOMPLEX, m, n, 0, 0, &c ); + +alpha = &BLIS_ONE; +beta = &BLIS_ONE; + +bli_randm( &a ); +bli_randm( &b ); +bli_randm( &c ); +``` +Then, you specify the computation precision by setting the computation +precision property of matrix C. +```c +bli_obj_set_comp_prec( BLIS_DOUBLE_PREC, &c ); +``` +If you do not explicitly specify the computation precision, it will default +to the *storage* precision of C. + +With the objects created and the computation precision specified, call +`bli_gemm()` just as you would if the datatypes were identical: +```c +bli_gemm( alpha, &a, &b, beta, &c ); +``` +For more examples of using BLIS's object-based API, including methods +of initializing a matrix object with arbitrary values, please review the +example code found in the `examples/oapi` directory of the BLIS source +distribution. + +## Known Issues + +While BLIS implements 128 mixed-datatype combinations of `gemm`, there may be +odd behavior in the current implementation that does not conform to the reader's +expectations. Below is a list of issues that BLIS developers are aware of in +the context of mixed-datatype `gemm`. If any of these issues poses a problem for +your application, please contact us by +[opening an issue](https://github.com/flame/blis/issues). + +* **alpha with non-zero imaginary components.** Currently, there are many cases +of mixed-datatype `gemm` that do not yet support computing with `alpha` scalars +that have non-zero imaginary components--in other words, values of `alpha` that +are not in the real domain. (By contrast, non-real values for `beta` are fully +supported.) In order to support these use cases, additional code complexity and +logic would be required. Thus, we have chosen, for now, to not implement them. 
+If mixed-datatype `gemm` is invoked with a non-real valued `alpha` scalar, a +runtime error message will be printed and the linked program will abort. + +* **Manually specifying the computation domain.** As mentioned in the section +discussing the [computation domain](MixedDatatypes.md#computation-domain), +the computation domain of any case of mixed domain `gemm` is implied by the +operands and thus fixed; the user may not specify a different computation +domain, even if the mixed-domain case would reasonably allow for computing +in either domain. + +## Conclusion + +For more information and documentation on BLIS, please visit the [BLIS github page](https://github.com/flame/blis/). + +If you found a bug or wish to request a feature, please [open an issue](https://github.com/flame/blis/issues). + +For general discussion or questions, please join and post a message to the [blis-devel mailing list](http://groups.google.com/group/blis-devel). + +Thanks for your interest in BLIS! + diff --git a/examples/oapi/11gemm_md.c b/examples/oapi/11gemm_md.c new file mode 100644 index 000000000..8ae40c1f4 --- /dev/null +++ b/examples/oapi/11gemm_md.c @@ -0,0 +1,269 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include <stdio.h> +#include "blis.h" + +int main( int argc, char** argv ) +{ + num_t dt_r, dt_c; + num_t dt_s, dt_d; + num_t dt_a, dt_b; + dim_t m, n, k; + inc_t rs, cs; + + obj_t a, b, c; + obj_t* alpha; + obj_t* beta; + + // + // This file demonstrates mixing datatypes in gemm. + // + // NOTE: Please make sure that mixed datatype support is enabled in BLIS + // before proceeding to build and run the example binaries. If you're not + // sure whether mixed datatype support is enabled in BLIS, please refer + // to './configure --help' for the relevant options. + // + + // + // Example 1: Perform a general matrix-matrix multiply (gemm) operation + // with operands of different domains (but identical precisions). + // + + printf( "\n#\n# -- Example 1 --\n#\n\n" ); + + // Create some matrix operands to work with. 
+ dt_r = BLIS_DOUBLE; + dt_c = BLIS_DCOMPLEX; + m = 4; n = 5; k = 1; rs = 0; cs = 0; + bli_obj_create( dt_c, m, n, rs, cs, &c ); + bli_obj_create( dt_r, m, k, rs, cs, &a ); + bli_obj_create( dt_c, k, n, rs, cs, &b ); + + // Set the scalars to use. + alpha = &BLIS_ONE; + beta = &BLIS_ONE; + + // Initialize the matrix operands. + bli_randm( &a ); + bli_randm( &b ); + bli_setm( &BLIS_ZERO, &c ); + + bli_printm( "a (double real): randomized", &a, "%4.1f", "" ); + bli_printm( "b (double complex): randomized", &b, "%4.1f", "" ); + bli_printm( "c (double complex): initial value", &c, "%4.1f", "" ); + + // c := beta * c + alpha * a * b, where 'a' is real, and 'b' and 'c' are + // complex. + bli_gemm( alpha, &a, &b, beta, &c ); + + bli_printm( "c (double complex): after gemm", &c, "%4.1f", "" ); + + // Free the objects. + bli_obj_free( &a ); + bli_obj_free( &b ); + bli_obj_free( &c ); + + // + // Example 2: Perform a general matrix-matrix multiply (gemm) operation + // with operands of different precisions (but identical domains). + // + + printf( "\n#\n# -- Example 2 --\n#\n\n" ); + + // Create some matrix operands to work with. + dt_s = BLIS_FLOAT; + dt_d = BLIS_DOUBLE; + m = 4; n = 5; k = 1; rs = 0; cs = 0; + bli_obj_create( dt_d, m, n, rs, cs, &c ); + bli_obj_create( dt_s, m, k, rs, cs, &a ); + bli_obj_create( dt_s, k, n, rs, cs, &b ); + + // Notice that we've chosen C to be double-precision real and A and B to be + // single-precision real. + + // Since we are mixing precisions, we will also need to specify the + // so-called "computation precision." That is, we need to signal to + // bli_gemm() whether we want the A*B product to be computed in single + // precision or double precision (prior to the result being accumulated + // back to C). To specify the computation precision, we need to set the + // corresponding bit in the C object. Here, we specify double-precision + // computation. 
+ // NOTE: If you do not explicitly specify the computation precision, it + // will default to the storage precision of the C object. + bli_obj_set_comp_prec( BLIS_DOUBLE_PREC, &c ); + + // Initialize the matrix operands. + bli_randm( &a ); + bli_randm( &b ); + bli_setm( &BLIS_ZERO, &c ); + + bli_printm( "a (single real): randomized", &a, "%4.1f", "" ); + bli_printm( "b (single real): randomized", &b, "%4.1f", "" ); + bli_printm( "c (double real): initial value", &c, "%4.1f", "" ); + + // c := beta * c + alpha * a * b, where 'a' and 'b' are single-precision + // real, 'c' is double-precision real, and the matrix product is performed + // in double-precision arithmetic. + bli_gemm( alpha, &a, &b, beta, &c ); + + bli_printm( "c (double real): after gemm (exec prec = double precision)", &c, "%4.1f", "" ); + + // Free the objects. + bli_obj_free( &a ); + bli_obj_free( &b ); + bli_obj_free( &c ); + + // + // Example 3: Perform a general matrix-matrix multiply (gemm) operation + // with operands of different domains AND precisions. + // + + printf( "\n#\n# -- Example 3 --\n#\n\n" ); + + // Create some matrix operands to work with. + dt_a = BLIS_FLOAT; + dt_b = BLIS_DCOMPLEX; + dt_c = BLIS_SCOMPLEX; + m = 4; n = 5; k = 1; rs = 0; cs = 0; + bli_obj_create( dt_c, m, n, rs, cs, &c ); + bli_obj_create( dt_a, m, k, rs, cs, &a ); + bli_obj_create( dt_b, k, n, rs, cs, &b ); + + // Notice that we've chosen C to be single-precision complex, and A to be + // single-precision real, and B to be double-precision complex. + + // Set the computation precision to single precision this time. + bli_obj_set_comp_prec( BLIS_SINGLE_PREC, &c ); + + // Initialize the matrix operands. 
+ bli_randm( &a ); + bli_randm( &b ); + bli_setm( &BLIS_ZERO, &c ); + + bli_printm( "a (single real): randomized", &a, "%4.1f", "" ); + bli_printm( "b (double complex): randomized", &b, "%4.1f", "" ); + bli_printm( "c (single complex): initial value", &c, "%4.1f", "" ); + + // c := beta * c + alpha * a * b, where 'a' is single-precision real, 'b' + // is double-precision complex, 'c' is single-precision complex, and the + // matrix product is performed in single-precision arithmetic. + bli_gemm( alpha, &a, &b, beta, &c ); + + bli_printm( "c (single complex): after gemm (exec prec = single precision)", &c, "%4.1f", "" ); + + // Free the objects. + bli_obj_free( &a ); + bli_obj_free( &b ); + bli_obj_free( &c ); + + // + // Example 4: Project objects between the real and complex domains. + // + + printf( "\n#\n# -- Example 4 --\n#\n\n" ); + + // Create some matrix operands to work with. + dt_r = BLIS_DOUBLE; + dt_c = BLIS_DCOMPLEX; + m = 4; n = 5; rs = 0; cs = 0; + bli_obj_create( dt_r, m, n, rs, cs, &a ); + bli_obj_create( dt_c, m, n, rs, cs, &b ); + + // Initialize a real matrix A. + bli_randm( &a ); + + bli_printm( "a (double real): randomized", &a, "%4.1f", "" ); + + // Project real matrix A to the complex domain (in B). + bli_projm( &a, &b ); + + bli_printm( "b (double complex): projected from 'a'", &b, "%4.1f", "" ); + + // Notice how the imaginary components in B are zero since any real + // matrix implicitly has imaginary values that are equal to zero. + + // Now let's project in the other direction. + + // Initialize the complex matrix B. + bli_randm( &b ); + + bli_printm( "b (double complex): randomized", &b, "%4.1f", "" ); + + // Project complex matrix B to the real domain (in A). + bli_projm( &b, &a ); + + bli_printm( "a (double real): projected from 'b'", &a, "%4.1f", "" ); + + // Notice how the imaginary components are lost in the projection from + // the complex domain to the real domain. + + // Free the objects. 
+ bli_obj_free( &a ); + bli_obj_free( &b ); + + // + // Example 5: Typecast objects between the single and double precisions. + // + + printf( "\n#\n# -- Example 5 --\n#\n\n" ); + + // Create some matrix operands to work with. + dt_s = BLIS_FLOAT; + dt_d = BLIS_DOUBLE; + m = 4; n = 3; rs = 0; cs = 0; + bli_obj_create( dt_d, m, n, rs, cs, &a ); + bli_obj_create( dt_s, m, n, rs, cs, &b ); + + // Initialize a double-precision real matrix A. + bli_randm( &a ); + + bli_printm( "a (double real): randomized", &a, "%23.16e", "" ); + + // Typecast A to single precision. + bli_castm( &a, &b ); + + bli_printm( "b (single real): typecast from 'a'", &b, "%23.16e", "" ); + + // Notice how the values in B are only accurate to the 6th or 7th decimal + // place relative to the true values in A. + + // Free the objects. + bli_obj_free( &a ); + bli_obj_free( &b ); + + + return 0; +} + diff --git a/examples/oapi/Makefile b/examples/oapi/Makefile index a8373c448..7f622bf5e 100644 --- a/examples/oapi/Makefile +++ b/examples/oapi/Makefile @@ -127,7 +127,8 @@ TEST_BINS := 00obj_basic.x \ 07level1m_diag.x \ 08level2.x \ 09level3.x \ - 10util.x + 10util.x \ + 11gemm_md.x diff --git a/frame/0/bli_l0_fpa.c b/frame/0/bli_l0_fpa.c index 37c4a5dfb..75db984f9 100644 --- a/frame/0/bli_l0_fpa.c +++ b/frame/0/bli_l0_fpa.c @@ -41,7 +41,7 @@ #undef GENFRONT #define GENFRONT( opname ) \ \ -GENARRAY_FPA( void*, opname ); \ +GENARRAY_FPA( PASTECH(opname,_vft), opname ); \ \ PASTECH(opname,_vft) PASTEMAC(opname,_qfp)( num_t dt ) \ { \ @@ -63,7 +63,7 @@ GENFRONT( zipsc ) #undef GENFRONT #define GENFRONT( opname ) \ \ -GENARRAY_FPA_I( void*, opname ); \ +GENARRAY_FPA_I( PASTECH(opname,_vft), opname ); \ \ PASTECH(opname,_vft) PASTEMAC(opname,_qfp)( num_t dt ) \ { \ diff --git a/frame/1d/bli_l1d_check.c b/frame/1d/bli_l1d_check.c index ca1e4657f..dc6c44aec 100644 --- a/frame/1d/bli_l1d_check.c +++ b/frame/1d/bli_l1d_check.c @@ -103,6 +103,22 @@ GENFRONT( setd ) GENFRONT( setid ) +#undef GENFRONT +#define 
GENFRONT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + obj_t* x, \ + obj_t* beta, \ + obj_t* y \ + ) \ +{ \ + bli_l1d_axy_check( beta, x, y ); \ +} + +GENFRONT( xpbyd ) + + // ----------------------------------------------------------------------------- void bli_l1d_xy_check diff --git a/frame/1d/bli_l1d_check.h b/frame/1d/bli_l1d_check.h index cd015f919..df5ea1f17 100644 --- a/frame/1d/bli_l1d_check.h +++ b/frame/1d/bli_l1d_check.h @@ -90,6 +90,19 @@ GENTPROT( setd ) GENTPROT( setid ) +#undef GENTPROT +#define GENTPROT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + obj_t* x, \ + obj_t* beta, \ + obj_t* y \ + ); + +GENTPROT( xpbyd ) + + // ----------------------------------------------------------------------------- void bli_l1d_xy_check diff --git a/frame/1d/bli_l1d_fpa.c b/frame/1d/bli_l1d_fpa.c index e244f27b8..6c57b1ab3 100644 --- a/frame/1d/bli_l1d_fpa.c +++ b/frame/1d/bli_l1d_fpa.c @@ -59,4 +59,5 @@ GENFRONT( invertd ) GENFRONT( scald ) GENFRONT( setd ) GENFRONT( setid ) +GENFRONT( xpbyd ) diff --git a/frame/1d/bli_l1d_fpa.h b/frame/1d/bli_l1d_fpa.h index 9eb66cdd3..915c2eb33 100644 --- a/frame/1d/bli_l1d_fpa.h +++ b/frame/1d/bli_l1d_fpa.h @@ -51,3 +51,5 @@ GENPROT( invertd ) GENPROT( scald ) GENPROT( setd ) GENPROT( setid ) +GENPROT( xpbyd ) + diff --git a/frame/1d/bli_l1d_ft.h b/frame/1d/bli_l1d_ft.h index debea1e62..5f6a487cf 100644 --- a/frame/1d/bli_l1d_ft.h +++ b/frame/1d/bli_l1d_ft.h @@ -131,3 +131,23 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ INSERT_GENTDEFR( setid ) +// xpbyd + +#undef GENTDEF +#define GENTDEF( ctype, ch, opname, tsuf ) \ +\ +typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ + ( \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ + BLIS_TAPI_EX_PARAMS \ + ); + +INSERT_GENTDEF( xpbyd ) + diff --git a/frame/1d/bli_l1d_oapi.c b/frame/1d/bli_l1d_oapi.c index 9395b129b..a9a445e9b 100644 
--- a/frame/1d/bli_l1d_oapi.c +++ b/frame/1d/bli_l1d_oapi.c @@ -312,5 +312,70 @@ void PASTEMAC(opname,EX_SUF) \ GENFRONT( setid ) +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,EX_SUF) \ + ( \ + obj_t* x, \ + obj_t* beta, \ + obj_t* y \ + BLIS_OAPI_EX_PARAMS \ + ) \ +{ \ + bli_init_once(); \ +\ + BLIS_OAPI_EX_DECLS \ +\ + num_t dt = bli_obj_dt( x ); \ +\ + doff_t diagoffx = bli_obj_diag_offset( x ); \ + diag_t diagx = bli_obj_diag( x ); \ + trans_t transx = bli_obj_conjtrans_status( x ); \ + dim_t m = bli_obj_length( y ); \ + dim_t n = bli_obj_width( y ); \ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t rs_x = bli_obj_row_stride( x ); \ + inc_t cs_x = bli_obj_col_stride( x ); \ + void* buf_y = bli_obj_buffer_at_off( y ); \ + inc_t rs_y = bli_obj_row_stride( y ); \ + inc_t cs_y = bli_obj_col_stride( y ); \ +\ + void* buf_beta; \ +\ + obj_t beta_local; \ +\ + if ( bli_error_checking_is_enabled() ) \ + PASTEMAC(opname,_check)( x, beta, y ); \ +\ + /* Create local copy-casts of scalars (and apply internal conjugation + as needed). */ \ + bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ + beta, &beta_local ); \ + buf_beta = bli_obj_buffer_for_1x1( dt, &beta_local ); \ +\ + /* Query a type-specific function pointer, except one that uses + void* instead of typed pointers. 
*/ \ + PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ + PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ +\ + f \ + ( \ + diagoffx, \ + diagx, \ + transx, \ + m, \ + n, \ + buf_x, rs_x, cs_x, \ + buf_beta, \ + buf_y, rs_y, cs_y, \ + cntx, \ + rntm \ + ); \ +} + +GENFRONT( xpbyd ) + + #endif diff --git a/frame/1d/bli_l1d_oapi.h b/frame/1d/bli_l1d_oapi.h index 319896ead..48eedfc63 100644 --- a/frame/1d/bli_l1d_oapi.h +++ b/frame/1d/bli_l1d_oapi.h @@ -93,3 +93,17 @@ GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) + +#undef GENTPROT +#define GENTPROT( opname ) \ +\ +void PASTEMAC(opname,EX_SUF) \ + ( \ + obj_t* x, \ + obj_t* beta, \ + obj_t* y \ + BLIS_OAPI_EX_PARAMS \ + ); + +GENTPROT( xpbyd ) + diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c index b6a24a604..f20269291 100644 --- a/frame/1d/bli_l1d_tapi.c +++ b/frame/1d/bli_l1d_tapi.c @@ -387,5 +387,83 @@ void PASTEMAC2(ch,opname,EX_SUF) \ INSERT_GENTFUNCR_BASIC2( setid, setv, BLIS_SETV_KER ) +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, kername, kerid ) \ +\ +void PASTEMAC2(ch,opname,EX_SUF) \ + ( \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ + BLIS_TAPI_EX_PARAMS \ + ) \ +{ \ + bli_init_once(); \ +\ + BLIS_TAPI_EX_DECLS \ +\ + const num_t dt = PASTEMAC(ch,type); \ +\ + ctype* x1; \ + ctype* y1; \ + conj_t conjx; \ + dim_t n_elem; \ + dim_t offx, offy; \ + inc_t incx, incy; \ +\ + if ( bli_zero_dim2( m, n ) ) return; \ +\ + if ( bli_is_outside_diag( diagoffx, transx, m, n ) ) return; \ +\ + /* Determine the distance to the diagonals, the number of diagonal + elements, and the diagonal increments. 
*/ \ + bli_set_dims_incs_2d \ + ( \ + diagoffx, transx, \ + m, n, rs_x, cs_x, rs_y, cs_y, \ + &offx, &offy, &n_elem, &incx, &incy \ + ); \ +\ + conjx = bli_extract_conj( transx ); \ +\ + if ( bli_is_nonunit_diag( diagx ) ) \ + { \ + x1 = x + offx; \ + y1 = y + offy; \ + } \ + else /* if ( bli_is_unit_diag( diagx ) ) */ \ + { \ + /* Simulate a unit diagonal for x with a zero increment over a unit + scalar. */ \ + x1 = PASTEMAC(ch,1); \ + incx = 0; \ + y1 = y + offy; \ + } \ +\ + /* Obtain a valid context from the gks if necessary. */ \ + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ +\ + /* Query the context for the operation's kernel address. */ \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ +\ + /* Invoke the kernel with the appropriate parameters. */ \ + f( \ + conjx, \ + n_elem, \ + x1, incx, \ + beta, \ + y1, incy, \ + cntx \ + ); \ +} + +INSERT_GENTFUNC_BASIC2( xpbyd, xpbyv, BLIS_XPBYV_KER ) + + #endif diff --git a/frame/1d/bli_l1d_tapi.h b/frame/1d/bli_l1d_tapi.h index 4314af5bc..5065768d1 100644 --- a/frame/1d/bli_l1d_tapi.h +++ b/frame/1d/bli_l1d_tapi.h @@ -125,3 +125,22 @@ void PASTEMAC2(ch,opname,EX_SUF) \ INSERT_GENTPROTR_BASIC0( setid ) + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC2(ch,opname,EX_SUF) \ + ( \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ + BLIS_TAPI_EX_PARAMS \ + ); + +INSERT_GENTPROT_BASIC0( xpbyd ) + diff --git a/frame/1m/bli_l1m_check.c b/frame/1m/bli_l1m_check.c index df811cae3..a6115e2a0 100644 --- a/frame/1m/bli_l1m_check.c +++ b/frame/1m/bli_l1m_check.c @@ -88,6 +88,22 @@ GENFRONT( scalm ) GENFRONT( setm ) +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + obj_t* x, \ + obj_t* beta, \ + obj_t* y \ + ) \ +{ \ + bli_l1m_axy_check( beta, x, y ); \ +} + +GENFRONT( xpbym ) + + // 
----------------------------------------------------------------------------- void bli_l1m_xy_check diff --git a/frame/1m/bli_l1m_check.h b/frame/1m/bli_l1m_check.h index 2e67e0674..90cf497b9 100644 --- a/frame/1m/bli_l1m_check.h +++ b/frame/1m/bli_l1m_check.h @@ -78,6 +78,19 @@ GENPROT( scalm ) GENPROT( setm ) +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + obj_t* x, \ + obj_t* beta, \ + obj_t* y \ + ); + +GENPROT( xpbym ) + + // ----------------------------------------------------------------------------- void bli_l1m_xy_check diff --git a/frame/1m/bli_l1m_fpa.c b/frame/1m/bli_l1m_fpa.c index 46b0d5c37..0f60cecf7 100644 --- a/frame/1m/bli_l1m_fpa.c +++ b/frame/1m/bli_l1m_fpa.c @@ -57,4 +57,23 @@ GENFRONT( axpym ) GENFRONT( scal2m ) GENFRONT( scalm ) GENFRONT( setm ) +GENFRONT( xpbym ) + +// +// Define function pointer query interfaces for two-datatype operations. +// + +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +GENARRAY_FPA2( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \ + PASTECH(opname,BLIS_TAPI_EX_SUF) ); \ +\ +PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ +PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty ) \ +{ \ + return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa2)[ dtx ][ dty ]; \ +} + +GENFRONT( xpbym_md ) diff --git a/frame/1m/bli_l1m_fpa.h b/frame/1m/bli_l1m_fpa.h index 3e07bf38d..076e2dec0 100644 --- a/frame/1m/bli_l1m_fpa.h +++ b/frame/1m/bli_l1m_fpa.h @@ -49,4 +49,13 @@ GENPROT( axpym ) GENPROT( scal2m ) GENPROT( scalm ) GENPROT( setm ) +GENPROT( xpbym ) + +#undef GENPROT +#define GENPROT( opname ) \ +\ +PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ +PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty ); + +GENPROT( xpbym_md ) diff --git a/frame/1m/bli_l1m_ft.h b/frame/1m/bli_l1m_ft.h index 9e7c9675e..593882ed0 100644 --- a/frame/1m/bli_l1m_ft.h +++ b/frame/1m/bli_l1m_ft.h @@ -141,3 +141,25 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ INSERT_GENTDEF( scalm ) INSERT_GENTDEF( setm ) +// xpbym 
+ +#undef GENTDEF +#define GENTDEF( ctype, ch, opname, tsuf ) \ +\ +typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ + ( \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ + BLIS_TAPI_EX_PARAMS \ + ); + +INSERT_GENTDEF( xpbym ) +INSERT_GENTDEF( xpbym_md ) + diff --git a/frame/1m/bli_l1m_oapi.c b/frame/1m/bli_l1m_oapi.c index 190d7857a..f66de309c 100644 --- a/frame/1m/bli_l1m_oapi.c +++ b/frame/1m/bli_l1m_oapi.c @@ -302,5 +302,141 @@ void PASTEMAC(opname,EX_SUF) \ GENFRONT( setm ) +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,EX_SUF) \ + ( \ + obj_t* x, \ + obj_t* beta, \ + obj_t* y \ + BLIS_OAPI_EX_PARAMS \ + ) \ +{ \ + bli_init_once(); \ +\ + BLIS_OAPI_EX_DECLS \ +\ + if ( bli_obj_dt( x ) != bli_obj_dt( y ) ) \ + return bli_xpbym_md( x, beta, y ); \ +\ + num_t dt = bli_obj_dt( x ); \ +\ + doff_t diagoffx = bli_obj_diag_offset( x ); \ + diag_t diagx = bli_obj_diag( x ); \ + uplo_t uplox = bli_obj_uplo( x ); \ + trans_t transx = bli_obj_conjtrans_status( x ); \ + dim_t m = bli_obj_length( y ); \ + dim_t n = bli_obj_width( y ); \ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t rs_x = bli_obj_row_stride( x ); \ + inc_t cs_x = bli_obj_col_stride( x ); \ + void* buf_y = bli_obj_buffer_at_off( y ); \ + inc_t rs_y = bli_obj_row_stride( y ); \ + inc_t cs_y = bli_obj_col_stride( y ); \ +\ + void* buf_beta; \ +\ + obj_t beta_local; \ +\ + if ( bli_error_checking_is_enabled() ) \ + PASTEMAC(opname,_check)( x, beta, y ); \ +\ + /* Create local copy-casts of scalars (and apply internal conjugation + as needed). */ \ + bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ + beta, &beta_local ); \ + buf_beta = bli_obj_buffer_for_1x1( dt, &beta_local ); \ +\ + /* Query a type-specific function pointer, except one that uses + void* instead of typed pointers. 
*/ \ + PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ + PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ +\ + f \ + ( \ + diagoffx, \ + diagx, \ + uplox, \ + transx, \ + m, \ + n, \ + buf_x, rs_x, cs_x, \ + buf_beta, \ + buf_y, rs_y, cs_y, \ + cntx, \ + rntm \ + ); \ +} + +GENFRONT( xpbym ) + + +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,EX_SUF) \ + ( \ + obj_t* x, \ + obj_t* beta, \ + obj_t* y \ + BLIS_OAPI_EX_PARAMS \ + ) \ +{ \ + bli_init_once(); \ +\ + BLIS_OAPI_EX_DECLS \ +\ + num_t dtx = bli_obj_dt( x ); \ + num_t dty = bli_obj_dt( y ); \ +\ + doff_t diagoffx = bli_obj_diag_offset( x ); \ + diag_t diagx = bli_obj_diag( x ); \ + uplo_t uplox = bli_obj_uplo( x ); \ + trans_t transx = bli_obj_conjtrans_status( x ); \ + dim_t m = bli_obj_length( y ); \ + dim_t n = bli_obj_width( y ); \ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t rs_x = bli_obj_row_stride( x ); \ + inc_t cs_x = bli_obj_col_stride( x ); \ + void* buf_y = bli_obj_buffer_at_off( y ); \ + inc_t rs_y = bli_obj_row_stride( y ); \ + inc_t cs_y = bli_obj_col_stride( y ); \ +\ + void* buf_beta; \ +\ + obj_t beta_local; \ +\ + /* Create local copy-casts of scalars (and apply internal conjugation + as needed). */ \ + bli_obj_scalar_init_detached_copy_of( dty, BLIS_NO_CONJUGATE, \ + beta, &beta_local ); \ + buf_beta = bli_obj_buffer_for_1x1( dty, &beta_local ); \ +\ + /* Query a (multi) type-specific function pointer, except one that uses + void* instead of typed pointers. 
*/ \ + PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ + PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( dtx, dty ); \ +\ + f \ + ( \ + diagoffx, \ + diagx, \ + uplox, \ + transx, \ + m, \ + n, \ + buf_x, rs_x, cs_x, \ + buf_beta, \ + buf_y, rs_y, cs_y, \ + cntx, \ + rntm \ + ); \ +} + +GENFRONT( xpbym_md ) + + + #endif diff --git a/frame/1m/bli_l1m_oapi.h b/frame/1m/bli_l1m_oapi.h index 2f72f1167..4a42ab00d 100644 --- a/frame/1m/bli_l1m_oapi.h +++ b/frame/1m/bli_l1m_oapi.h @@ -80,3 +80,18 @@ void PASTEMAC(opname,EX_SUF) \ GENPROT( scalm ) GENPROT( setm ) + +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTEMAC(opname,EX_SUF) \ + ( \ + obj_t* x, \ + obj_t* beta, \ + obj_t* y \ + BLIS_OAPI_EX_PARAMS \ + ); + +GENPROT( xpbym ) +GENPROT( xpbym_md ) + diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c index d852c4794..fb7173c47 100644 --- a/frame/1m/bli_l1m_tapi.c +++ b/frame/1m/bli_l1m_tapi.c @@ -382,5 +382,155 @@ INSERT_GENTFUNC_BASIC0( scalm ) INSERT_GENTFUNC_BASIC0( setm ) +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTEMAC2(ch,opname,EX_SUF) \ + ( \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ + BLIS_TAPI_EX_PARAMS \ + ) \ +{ \ + bli_init_once(); \ +\ + BLIS_TAPI_EX_DECLS \ +\ + if ( bli_zero_dim2( m, n ) ) return; \ +\ + /* Obtain a valid context from the gks if necessary. */ \ + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ +\ + /* If beta is zero, then the operation reduces to copym. */ \ + if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + PASTEMAC2(ch,copym,_unb_var1) \ + ( \ + diagoffx, \ + diagx, \ + uplox, \ + transx, \ + m, \ + n, \ + x, rs_x, cs_x, \ + y, rs_y, cs_y, \ + cntx, \ + rntm \ + ); \ +\ + return; \ + } \ +\ + /* Invoke the helper variant, which loops over the appropriate kernel + to implement the current operation. 
*/ \ + PASTEMAC2(ch,opname,_unb_var1) \ + ( \ + diagoffx, \ + diagx, \ + uplox, \ + transx, \ + m, \ + n, \ + x, rs_x, cs_x, \ + beta, \ + y, rs_y, cs_y, \ + cntx, \ + rntm \ + ); \ +\ + /* When the diagonal of an upper- or lower-stored matrix is unit, + we handle it with a separate post-processing step. */ \ + if ( bli_is_upper_or_lower( uplox ) && \ + bli_is_unit_diag( diagx ) ) \ + { \ + PASTEMAC2(ch,xpbyd,BLIS_TAPI_EX_SUF) \ + ( \ + diagoffx, \ + diagx, \ + transx, \ + m, \ + n, \ + x, rs_x, cs_x, \ + beta, \ + y, rs_y, cs_y, \ + cntx, \ + rntm \ + ); \ + } \ +} + +INSERT_GENTFUNC_BASIC0( xpbym ) + + +#undef GENTFUNC2 +#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname ) \ +\ +void PASTEMAC3(chx,chy,opname,EX_SUF) \ + ( \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype_x* x, inc_t rs_x, inc_t cs_x, \ + ctype_y* beta, \ + ctype_y* y, inc_t rs_y, inc_t cs_y \ + BLIS_TAPI_EX_PARAMS \ + ) \ +{ \ + bli_init_once(); \ +\ + BLIS_TAPI_EX_DECLS \ +\ + if ( bli_zero_dim2( m, n ) ) return; \ +\ + /* Obtain a valid context from the gks if necessary. */ \ + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ +\ + /* If beta is zero, then the operation reduces to castm. */ \ + if ( PASTEMAC(chy,eq0)( *beta ) ) \ + { \ + PASTEMAC2(chx,chy,castm) \ + ( \ + transx, \ + m, \ + n, \ + x, rs_x, cs_x, \ + y, rs_y, cs_y \ + ); \ +\ + return; \ + } \ +\ + /* Invoke the helper variant, which loops over the appropriate kernel + to implement the current operation.
*/ \ + PASTEMAC3(chx,chy,opname,_unb_var1) \ + ( \ + diagoffx, \ + diagx, \ + uplox, \ + transx, \ + m, \ + n, \ + x, rs_x, cs_x, \ + beta, \ + y, rs_y, cs_y, \ + cntx, \ + rntm \ + ); \ +} + +INSERT_GENTFUNC2_BASIC0( xpbym_md ) +INSERT_GENTFUNC2_MIXDP0( xpbym_md ) + + #endif diff --git a/frame/1m/bli_l1m_tapi.h b/frame/1m/bli_l1m_tapi.h index cacf93394..ccd2f77a9 100644 --- a/frame/1m/bli_l1m_tapi.h +++ b/frame/1m/bli_l1m_tapi.h @@ -98,3 +98,44 @@ void PASTEMAC2(ch,opname,EX_SUF) \ INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC2(ch,opname,EX_SUF) \ + ( \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ + BLIS_TAPI_EX_PARAMS \ + ); + +INSERT_GENTPROT_BASIC0( xpbym ) + + +#undef GENTPROT2 +#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ +\ +void PASTEMAC3(chx,chy,opname,EX_SUF) \ + ( \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype_x* x, inc_t rs_x, inc_t cs_x, \ + ctype_y* beta, \ + ctype_y* y, inc_t rs_y, inc_t cs_y \ + BLIS_TAPI_EX_PARAMS \ + ); + +INSERT_GENTPROT2_BASIC0( xpbym_md ) +INSERT_GENTPROT2_MIXDP0( xpbym_md ) + diff --git a/frame/1m/bli_l1m_unb_var1.c b/frame/1m/bli_l1m_unb_var1.c index 2e80a5998..3d08db661 100644 --- a/frame/1m/bli_l1m_unb_var1.c +++ b/frame/1m/bli_l1m_unb_var1.c @@ -378,3 +378,252 @@ void PASTEMAC(ch,opname) \ INSERT_GENTFUNC_BASIC2( scalm_unb_var1, scalv, BLIS_SCALV_KER ) INSERT_GENTFUNC_BASIC2( setm_unb_var1, setv, BLIS_SETV_KER ) + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, kername, kerid ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* beta, \ + ctype* y, inc_t rs_y, 
inc_t cs_y, \ + cntx_t* cntx, \ + rntm_t* rntm \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + ctype* x1; \ + ctype* y1; \ + uplo_t uplox_eff; \ + conj_t conjx; \ + dim_t n_iter; \ + dim_t n_elem, n_elem_max; \ + inc_t ldx, incx; \ + inc_t ldy, incy; \ + dim_t j, i; \ + dim_t ij0, n_shift; \ +\ + /* Set various loop parameters. */ \ + bli_set_dims_incs_uplo_2m \ + ( \ + diagoffx, diagx, transx, \ + uplox, m, n, rs_x, cs_x, rs_y, cs_y, \ + &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \ + &ij0, &n_shift \ + ); \ +\ + if ( bli_is_zeros( uplox_eff ) ) return; \ +\ + /* Extract the conjugation component from the transx parameter. */ \ + conjx = bli_extract_conj( transx ); \ +\ + /* Query the kernel needed for this operation. */ \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ +\ + /* Handle dense and upper/lower storage cases separately. */ \ + if ( bli_is_dense( uplox_eff ) ) \ + { \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + n_elem = n_elem_max; \ +\ + x1 = x + (j )*ldx + (0 )*incx; \ + y1 = y + (j )*ldy + (0 )*incy; \ +\ + /* Invoke the kernel with the appropriate parameters. */ \ + f( \ + conjx, \ + n_elem, \ + x1, incx, \ + beta, \ + y1, incy, \ + cntx \ + ); \ + } \ + } \ + else \ + { \ + if ( bli_is_upper( uplox_eff ) ) \ + { \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ +\ + x1 = x + (ij0+j )*ldx + (0 )*incx; \ + y1 = y + (ij0+j )*ldy + (0 )*incy; \ +\ + /* Invoke the kernel with the appropriate parameters. */ \ + f( \ + conjx, \ + n_elem, \ + x1, incx, \ + beta, \ + y1, incy, \ + cntx \ + ); \ + } \ + } \ + else if ( bli_is_lower( uplox_eff ) ) \ + { \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + i = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ + n_elem = n_elem_max - i; \ +\ + x1 = x + (j )*ldx + (ij0+i )*incx; \ + y1 = y + (j )*ldy + (ij0+i )*incy; \ +\ + /* Invoke the kernel with the appropriate parameters. 
*/ \ + f( \ + conjx, \ + n_elem, \ + x1, incx, \ + beta, \ + y1, incy, \ + cntx \ + ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC2( xpbym_unb_var1, xpbyv, BLIS_XPBYV_KER ) + + +#undef GENTFUNC2 +#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname ) \ +\ +void PASTEMAC2(chx,chy,opname) \ + ( \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype_x* x, inc_t rs_x, inc_t cs_x, \ + ctype_y* beta, \ + ctype_y* y, inc_t rs_y, inc_t cs_y, \ + cntx_t* cntx, \ + rntm_t* rntm \ + ) \ +{ \ + ctype_x* restrict x1; \ + ctype_y* restrict y1; \ + uplo_t uplox_eff; \ + dim_t n_iter; \ + dim_t n_elem, n_elem_max; \ + inc_t ldx, incx; \ + inc_t ldy, incy; \ + dim_t j, i; \ + dim_t ij0, n_shift; \ +\ + /* Set various loop parameters. */ \ + bli_set_dims_incs_uplo_2m \ + ( \ + diagoffx, diagx, transx, \ + uplox, m, n, rs_x, cs_x, rs_y, cs_y, \ + &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \ + &ij0, &n_shift \ + ); \ +\ + /* Extract the conjugation component from the transx parameter. */ \ + /*conjx = bli_extract_conj( transx );*/ \ +\ + /* Handle the beta == 1 and general-beta cases separately.
*/ \ + if ( PASTEMAC(chy,eq1)( *beta ) ) \ + { \ + if ( incx == 1 && incy == 1 ) \ + { \ + n_elem = n_elem_max; \ +\ + for ( j = 0; j < n_iter; ++j ) \ + { \ + x1 = x + (j )*ldx + (0 )*incx; \ + y1 = y + (j )*ldy + (0 )*incy; \ +\ + ctype_x* restrict chi1 = x1; \ + ctype_y* restrict psi1 = y1; \ +\ + for ( i = 0; i < n_elem; ++i ) \ + { \ + PASTEMAC2(chx,chy,adds)( chi1[i], psi1[i] ); \ + } \ + } \ + } \ + else \ + { \ + n_elem = n_elem_max; \ +\ + for ( j = 0; j < n_iter; ++j ) \ + { \ + x1 = x + (j )*ldx + (0 )*incx; \ + y1 = y + (j )*ldy + (0 )*incy; \ +\ + ctype_x* restrict chi1 = x1; \ + ctype_y* restrict psi1 = y1; \ +\ + for ( i = 0; i < n_elem; ++i ) \ + { \ + PASTEMAC2(chx,chy,adds)( *chi1, *psi1 ); \ +\ + chi1 += incx; \ + psi1 += incy; \ + } \ + } \ + } \ + } \ + else /* ( !PASTEMAC(chy,eq1)( *beta ) ) */ \ + { \ + if ( incx == 1 && incy == 1 ) \ + { \ + n_elem = n_elem_max; \ +\ + for ( j = 0; j < n_iter; ++j ) \ + { \ + x1 = x + (j )*ldx + (0 )*incx; \ + y1 = y + (j )*ldy + (0 )*incy; \ +\ + ctype_x* restrict chi1 = x1; \ + ctype_y* restrict psi1 = y1; \ +\ + for ( i = 0; i < n_elem; ++i ) \ + { \ + PASTEMAC3(chx,chy,chy,xpbys)( chi1[i], *beta, psi1[i] ); \ + } \ + } \ + } \ + else \ + { \ + n_elem = n_elem_max; \ +\ + for ( j = 0; j < n_iter; ++j ) \ + { \ + x1 = x + (j )*ldx + (0 )*incx; \ + y1 = y + (j )*ldy + (0 )*incy; \ +\ + ctype_x* restrict chi1 = x1; \ + ctype_y* restrict psi1 = y1; \ +\ + for ( i = 0; i < n_elem; ++i ) \ + { \ + PASTEMAC3(chx,chy,chy,xpbys)( *chi1, *beta, *psi1 ); \ +\ + chi1 += incx; \ + psi1 += incy; \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC2_BASIC0( xpbym_md_unb_var1 ) +INSERT_GENTFUNC2_MIXDP0( xpbym_md_unb_var1 ) + diff --git a/frame/1m/bli_l1m_unb_var1.h b/frame/1m/bli_l1m_unb_var1.h index b42e1035a..f6014f6b4 100644 --- a/frame/1m/bli_l1m_unb_var1.h +++ b/frame/1m/bli_l1m_unb_var1.h @@ -101,3 +101,46 @@ void PASTEMAC2(ch,opname,_unb_var1) \ INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) + +#undef 
GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC2(ch,opname,_unb_var1) \ + ( \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y, \ + cntx_t* cntx, \ + rntm_t* rntm \ + ); + +INSERT_GENTPROT_BASIC0( xpbym ) + + +#undef GENTPROT2 +#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ +\ +void PASTEMAC3(chx,chy,opname,_unb_var1) \ + ( \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype_x* x, inc_t rs_x, inc_t cs_x, \ + ctype_y* beta, \ + ctype_y* y, inc_t rs_y, inc_t cs_y, \ + cntx_t* cntx, \ + rntm_t* rntm \ + ); + +INSERT_GENTPROT2_BASIC0( xpbym_md ) +INSERT_GENTPROT2_MIXDP0( xpbym_md ) + diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index a336cf9f2..6c88ea893 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -55,3 +55,8 @@ #include "bli_packm_cxk_rih.h" #include "bli_packm_cxk_1er.h" +// Mixed datatype support. +#ifdef BLIS_ENABLE_GEMM_MD +#include "bli_packm_md.h" +#endif + diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 383462726..195315886 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -108,7 +108,17 @@ void bli_packm_blk_var1 thrinfo_t* t ) { - num_t dt_cp = bli_obj_dt( c ); +#ifdef BLIS_ENABLE_GEMM_MD + // Call a different packm implementation when the storage and target + // datatypes differ. + if ( bli_obj_dt( c ) != bli_obj_target_dt( c ) ) + { + bli_packm_blk_var1_md( c, p, cntx, cntl, t ); + return; + } +#endif + + num_t dt_c = bli_obj_dt( c ); struc_t strucc = bli_obj_struc( c ); doff_t diagoffc = bli_obj_diag_offset( c ); @@ -155,7 +165,7 @@ void bli_packm_blk_var1 // higher-level operation. 
Thus, we use BLIS_ONE for kappa so // that the underlying packm implementation does not perform // any scaling during packing. - buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE ); + buf_kappa = bli_obj_buffer_for_const( dt_c, &BLIS_ONE ); } else // if ( bli_is_ind_packed( schema ) ) { @@ -187,11 +197,10 @@ void bli_packm_blk_var1 } // Acquire the buffer to the kappa chosen above. - buf_kappa = bli_obj_buffer_for_1x1( dt_cp, kappa_p ); + buf_kappa = bli_obj_buffer_for_1x1( dt_c, kappa_p ); } - // Choose the correct func_t object based on the pack_t schema. #if 0 if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers; else if ( bli_is_3mi_packed( schema ) || @@ -208,7 +217,7 @@ void bli_packm_blk_var1 //func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx ); - //if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) ) + //if ( bli_func_is_null_dt( dt_c, cntx_packm_kers ) ) { // If the packm structure-aware kernel func_t in the context is // NULL (which is the default value after the context is created), @@ -230,11 +239,11 @@ void bli_packm_blk_var1 #endif // Query the datatype-specific function pointer from the func_t object. - packm_ker = bli_func_get_dt( dt_cp, packm_kers ); + packm_ker = bli_func_get_dt( dt_c, packm_kers ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_cp]; + f = ftypes[dt_c]; // Invoke the function. 
f( strucc, @@ -433,10 +442,10 @@ void PASTEMAC(ch,varname) \ \ /* if ( row_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \ +PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", m, n, \ c_cast, rs_c, cs_c, "%4.1f", "" ); \ if ( col_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ +PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", m, n, \ c_cast, rs_c, cs_c, "%4.1f", "" ); \ */ \ \ @@ -605,6 +614,15 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ } \ \ /* +if ( row_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +else \ +PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +*/ \ +\ +/* if ( col_stored ) { \ if ( bli_thread_work_id( thread ) == 0 ) \ { \ diff --git a/frame/1m/packm/bli_packm_blk_var1.c.old b/frame/1m/packm/bli_packm_blk_var1.c.old deleted file mode 100644 index 4b18302f4..000000000 --- a/frame/1m/packm/bli_packm_blk_var1.c.old +++ /dev/null @@ -1,463 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T packm_fp - -typedef void (*FUNCPTR_T)( - struc_t strucc, - doff_t diagoffc, - diag_t diagc, - uplo_t uploc, - trans_t transc, - pack_t schema, - bool_t invdiag, - bool_t revifup, - bool_t reviflo, - dim_t m, - dim_t n, - dim_t m_max, - dim_t n_max, - void* kappa, - void* c, inc_t rs_c, inc_t cs_c, - void* p, inc_t rs_p, inc_t cs_p, - inc_t is_p, - dim_t pd_p, inc_t ps_p, - void* packm_ker, - packm_thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1); - -extern func_t* packm_struc_cxk_kers; - - -void bli_packm_blk_var1( obj_t* c, - obj_t* p, - packm_thrinfo_t* t ) -{ - num_t dt_cp = bli_obj_dt( c ); - - struc_t strucc = bli_obj_struc( c ); - doff_t diagoffc = bli_obj_diag_offset( c ); - diag_t diagc = bli_obj_diag( c ); - uplo_t uploc = bli_obj_uplo( c ); - trans_t transc = bli_obj_conjtrans_status( c ); - pack_t schema = bli_obj_pack_schema( p ); - bool_t invdiag = bli_obj_has_inverted_diag( p ); - bool_t revifup = bli_obj_is_pack_rev_if_upper( p ); - bool_t reviflo = bli_obj_is_pack_rev_if_lower( p ); - - dim_t m_p = bli_obj_length( p ); - dim_t n_p = bli_obj_width( p ); - dim_t m_max_p = bli_obj_padded_length( p ); - dim_t 
n_max_p = bli_obj_padded_width( p ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_p = bli_obj_buffer_at_off( p ); - inc_t rs_p = bli_obj_row_stride( p ); - inc_t cs_p = bli_obj_col_stride( p ); - inc_t is_p = bli_obj_imag_stride( p ); - dim_t pd_p = bli_obj_panel_dim( p ); - inc_t ps_p = bli_obj_panel_stride( p ); - - void* buf_kappa; - - func_t* packm_kers; - void* packm_ker; - - FUNCPTR_T f; - - // This variant assumes that the micro-kernel will always apply the - // alpha scalar of the higher-level operation. Thus, we use BLIS_ONE - // for kappa so that the underlying packm implementation does not - // scale during packing. - buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE ); - - // Choose the correct func_t object. - packm_kers = packm_struc_cxk_kers; - - // Query the datatype-specific function pointer from the func_t object. - packm_ker = bli_func_obj_query( dt_cp, packm_kers ); - - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_cp]; - - // Invoke the function. 
- f( strucc, - diagoffc, - diagc, - uploc, - transc, - schema, - invdiag, - revifup, - reviflo, - m_p, - n_p, - m_max_p, - n_max_p, - buf_kappa, - buf_c, rs_c, cs_c, - buf_p, rs_p, cs_p, - is_p, - pd_p, ps_p, - packm_ker, - t ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, kertype ) \ -\ -void PASTEMAC(ch,varname) \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - pack_t schema, \ - bool_t invdiag, \ - bool_t revifup, \ - bool_t reviflo, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - dim_t pd_p, inc_t ps_p, \ - void* packm_ker, \ - packm_thrinfo_t* thread \ - ) \ -{ \ - PASTECH(ch,kertype) packm_ker_cast = packm_ker; \ -\ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict c_cast = c; \ - ctype* restrict p_cast = p; \ - ctype* restrict c_begin; \ - ctype* restrict p_begin; \ -\ - dim_t iter_dim; \ - dim_t num_iter; \ - dim_t it, ic, ip; \ - dim_t ic0, ip0; \ - doff_t ic_inc, ip_inc; \ - doff_t diagoffc_i; \ - doff_t diagoffc_inc; \ - dim_t panel_len_full; \ - dim_t panel_len_i; \ - dim_t panel_len_max; \ - dim_t panel_len_max_i; \ - dim_t panel_dim_i; \ - dim_t panel_dim_max; \ - dim_t panel_off_i; \ - inc_t vs_c; \ - inc_t ldc; \ - inc_t ldp, p_inc; \ - dim_t* m_panel_full; \ - dim_t* n_panel_full; \ - dim_t* m_panel_use; \ - dim_t* n_panel_use; \ - dim_t* m_panel_max; \ - dim_t* n_panel_max; \ - conj_t conjc; \ - bool_t row_stored; \ - bool_t col_stored; \ -\ - ctype* restrict c_use; \ - ctype* restrict p_use; \ - doff_t diagoffp_i; \ -\ -\ - /* If C is zeros and part of a triangular matrix, then we don't need - to pack it. */ \ - if ( bli_is_zeros( uploc ) && \ - bli_is_triangular( strucc ) ) return; \ -\ - /* Extract the conjugation bit from the transposition argument. 
*/ \ - conjc = bli_extract_conj( transc ); \ -\ - /* If c needs a transposition, induce it so that we can more simply - express the remaining parameters and code. */ \ - if ( bli_does_trans( transc ) ) \ - { \ - bli_swap_incs( &rs_c, &cs_c ); \ - bli_negate_diag_offset( &diagoffc ); \ - bli_toggle_uplo( &uploc ); \ - bli_toggle_trans( &transc ); \ - } \ -\ - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. */ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ -\ - /* If the row storage flag indicates row storage, then we are packing - to column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( row_stored ) \ - { \ - /* Prepare to pack to row-stored column panels. */ \ - iter_dim = n; \ - panel_len_full = m; \ - panel_len_max = m_max; \ - panel_dim_max = pd_p; \ - ldc = rs_c; \ - vs_c = cs_c; \ - diagoffc_inc = -( doff_t )panel_dim_max; \ - ldp = rs_p; \ - m_panel_full = &m; \ - n_panel_full = &panel_dim_i; \ - m_panel_use = &panel_len_i; \ - n_panel_use = &panel_dim_i; \ - m_panel_max = &panel_len_max_i; \ - n_panel_max = &panel_dim_max; \ - } \ - else /* if ( col_stored ) */ \ - { \ - /* Prepare to pack to column-stored row panels. */ \ - iter_dim = m; \ - panel_len_full = n; \ - panel_len_max = n_max; \ - panel_dim_max = pd_p; \ - ldc = cs_c; \ - vs_c = rs_c; \ - diagoffc_inc = ( doff_t )panel_dim_max; \ - ldp = cs_p; \ - m_panel_full = &panel_dim_i; \ - n_panel_full = &n; \ - m_panel_use = &panel_dim_i; \ - n_panel_use = &panel_len_i; \ - m_panel_max = &panel_dim_max; \ - n_panel_max = &panel_len_max_i; \ - } \ -\ - /* Compute the total number of iterations we'll need. */ \ - num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 
1 : 0 ); \ -\ - /* Set the initial values and increments for indices related to C and P - based on whether reverse iteration was requested. */ \ - if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \ - ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \ - { \ - ic0 = (num_iter - 1) * panel_dim_max; \ - ic_inc = -panel_dim_max; \ - ip0 = num_iter - 1; \ - ip_inc = -1; \ - } \ - else \ - { \ - ic0 = 0; \ - ic_inc = panel_dim_max; \ - ip0 = 0; \ - ip_inc = 1; \ - } \ -\ - p_begin = p_cast; \ -\ - for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ - { \ - panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ -\ - diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ - c_begin = c_cast + (ic )*vs_c; \ -\ - if ( bli_is_triangular( strucc ) && \ - bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ - { \ - /* This case executes if the panel belongs to a triangular - matrix AND is completely unstored (ie: zero). If the panel - is unstored, we do nothing. (Notice that we don't even - increment p_begin.) */ \ -\ - continue; \ - } \ - else if ( bli_is_triangular( strucc ) && \ - bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \ - { \ - /* This case executes if the panel belongs to a triangular - matrix AND is diagonal-intersecting. Notice that we - cannot bury the following conditional logic into - packm_struc_cxk() because we need to know the value of - panel_len_max_i so we can properly increment p_inc. */ \ -\ - /* Sanity check. Diagonals should not intersect the short end of - a micro-panel. If they do, then somehow the constraints on - cache blocksizes being a whole multiple of the register - blocksizes was somehow violated. 
*/ \ - if ( ( col_stored && diagoffc_i < 0 ) || \ - ( row_stored && diagoffc_i > 0 ) ) \ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ -\ - if ( ( row_stored && bli_is_upper( uploc ) ) || \ - ( col_stored && bli_is_lower( uploc ) ) ) \ - { \ - panel_off_i = 0; \ - panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \ - panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \ - panel_len_max ); \ - diagoffp_i = diagoffc_i; \ - } \ - else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ - ( col_stored && bli_is_upper( uploc ) ) ) */ \ - { \ - panel_off_i = bli_abs( diagoffc_i ); \ - panel_len_i = panel_len_full - panel_off_i; \ - panel_len_max_i = panel_len_max - panel_off_i; \ - diagoffp_i = 0; \ - } \ -\ - c_use = c_begin + (panel_off_i )*ldc; \ - p_use = p_begin; \ -\ - if( packm_thread_my_iter( it, thread ) ) \ - { \ - packm_ker_cast( strucc, \ - diagoffp_i, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p, \ - is_p ); \ - } \ -\ - /* NOTE: This value is usually LESS than ps_p because triangular - matrices usually have several micro-panels that are shorter - than a "full" micro-panel. */ \ - p_inc = ldp * panel_len_max_i; \ -\ - /* We nudge the panel increment up by one if it is odd. */ \ - p_inc += ( bli_is_odd( p_inc ) ? 1 : 0 ); \ - } \ - else if ( bli_is_herm_or_symm( strucc ) ) \ - { \ - /* This case executes if the panel belongs to a Hermitian or - symmetric matrix, which includes stored, unstored, and - diagonal-intersecting panels. 
*/ \ -\ - panel_len_i = panel_len_full; \ - panel_len_max_i = panel_len_max; \ -\ - if( packm_thread_my_iter( it, thread ) ) \ - { \ - packm_ker_cast( strucc, \ - diagoffc_i, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_begin, rs_c, cs_c, \ - p_begin, rs_p, cs_p, \ - is_p ); \ - } \ -\ - /* NOTE: This value is equivalent to ps_p. */ \ - /*p_inc = ldp * panel_len_max_i;*/ \ - p_inc = ps_p; \ - } \ - else \ - { \ - /* This case executes if the panel is general, or, if the - panel is part of a triangular matrix and is neither unstored - (ie: zero) nor diagonal-intersecting. */ \ -\ - panel_len_i = panel_len_full; \ - panel_len_max_i = panel_len_max; \ -\ - if( packm_thread_my_iter( it, thread ) ) \ - { \ - packm_ker_cast( BLIS_GENERAL, \ - 0, \ - diagc, \ - BLIS_DENSE, \ - conjc, \ - schema, \ - invdiag, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_begin, rs_c, cs_c, \ - p_begin, rs_p, cs_p, \ - is_p ); \ - } \ -/* - if ( row_stored ) \ - PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", panel_len_max_i, panel_dim_max, \ - p_begin, rs_p, cs_p, "%9.2e", "" ); \ - else if ( col_stored ) \ - PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", panel_dim_max, panel_len_max_i, \ - p_begin, rs_p, cs_p, "%9.2e", "" ); \ -*/ \ -\ - /* NOTE: This value is equivalent to ps_p. */ \ - /*p_inc = ldp * panel_len_max_i;*/ \ - p_inc = ps_p; \ - } \ -\ -\ - p_begin += p_inc; \ - } \ -} - -INSERT_GENTFUNC_BASIC( packm_blk_var1, packm_ker_t ) - diff --git a/frame/1m/packm/bli_packm_blk_var1_md.c b/frame/1m/packm/bli_packm_blk_var1_md.c new file mode 100644 index 000000000..4efd0074c --- /dev/null +++ b/frame/1m/packm/bli_packm_blk_var1_md.c @@ -0,0 +1,293 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_GEMM_MD + +#define FUNCPTR_T packm_fp + +typedef void (*FUNCPTR_T)( + trans_t transc, + pack_t schema, + dim_t m, + dim_t n, + dim_t m_max, + dim_t n_max, + void* kappa, + void* c, inc_t rs_c, inc_t cs_c, + void* p, inc_t rs_p, inc_t cs_p, + inc_t is_p, + dim_t pd_p, inc_t ps_p, + cntx_t* cntx, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY2_ALL(ftypes,packm_blk_var1_md); + + +void bli_packm_blk_var1_md + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* t + ) +{ + num_t dt_c = bli_obj_dt( c ); + num_t dt_p = bli_obj_dt( p ); + + trans_t transc = bli_obj_conjtrans_status( c ); + pack_t schema = bli_obj_pack_schema( p ); + + dim_t m_p = bli_obj_length( p ); + dim_t n_p = bli_obj_width( p ); + dim_t m_max_p = bli_obj_padded_length( p ); + dim_t n_max_p = bli_obj_padded_width( p ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + void* buf_p = bli_obj_buffer_at_off( p ); + inc_t rs_p = bli_obj_row_stride( p ); + inc_t cs_p = bli_obj_col_stride( p ); + inc_t is_p = bli_obj_imag_stride( p ); + dim_t pd_p = bli_obj_panel_dim( p ); + inc_t ps_p = bli_obj_panel_stride( p ); + + void* buf_kappa; + + FUNCPTR_T f; + + + // Unconditionally use kappa = 1.0. Thus, we don't support scaling + // during packing when mixing datatypes. + buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_c][dt_p]; + + // Invoke the function. 
+ f( + transc, + schema, + m_p, + n_p, + m_max_p, + n_max_p, + buf_kappa, + buf_c, rs_c, cs_c, + buf_p, rs_p, cs_p, + is_p, + pd_p, ps_p, + cntx, + t ); +} + + +#undef GENTFUNC2 +#define GENTFUNC2( ctype_c, ctype_p, chc, chp, varname ) \ +\ +void PASTEMAC2(chc,chp,varname) \ + ( \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* cntx, \ + thrinfo_t* thread \ + ) \ +{ \ + ctype_p* restrict kappa_cast = kappa; \ + ctype_c* restrict c_cast = c; \ + ctype_p* restrict p_cast = p; \ + ctype_c* restrict c_begin; \ + ctype_p* restrict p_begin; \ +\ + dim_t iter_dim; \ + dim_t num_iter; \ + dim_t it, ic, ip; \ + doff_t ic_inc, ip_inc; \ + dim_t panel_len_full; \ + dim_t panel_len_i; \ + dim_t panel_len_max; \ + dim_t panel_len_max_i; \ + dim_t panel_dim_i; \ + dim_t panel_dim_max; \ + inc_t vs_c; \ + inc_t p_inc; \ + dim_t* m_panel_use; \ + dim_t* n_panel_use; \ + dim_t* m_panel_max; \ + dim_t* n_panel_max; \ + conj_t conjc; \ + bool_t row_stored; \ + bool_t col_stored; \ +\ + ctype_c* restrict c_use; \ + ctype_p* restrict p_use; \ +\ +\ + /* Extract the conjugation bit from the transposition argument. */ \ + conjc = bli_extract_conj( transc ); \ +\ + /* If c needs a transposition, induce it so that we can more simply + express the remaining parameters and code. */ \ + if ( bli_does_trans( transc ) ) \ + { \ + bli_swap_incs( &rs_c, &cs_c ); \ + bli_toggle_trans( &transc ); \ + } \ +\ + /* Create flags to indicate row or column storage. Note that the + schema bit that encodes row or column is describing the form of + micro-panel, not the storage in the micro-panel. Hence the + mismatch in "row" and "column" semantics. 
*/ \ + row_stored = bli_is_col_packed( schema ); \ + col_stored = bli_is_row_packed( schema ); \ +\ + ( void )col_stored; \ +\ + /* If the row storage flag indicates row storage, then we are packing + to column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( row_stored ) \ + { \ + /* Prepare to pack to row-stored column panels. */ \ + iter_dim = n; \ + panel_len_full = m; \ + panel_len_max = m_max; \ + panel_dim_max = pd_p; \ + vs_c = cs_c; \ + m_panel_use = &panel_len_i; \ + n_panel_use = &panel_dim_i; \ + m_panel_max = &panel_len_max_i; \ + n_panel_max = &panel_dim_max; \ + } \ + else /* if ( col_stored ) */ \ + { \ + /* Prepare to pack to column-stored row panels. */ \ + iter_dim = m; \ + panel_len_full = n; \ + panel_len_max = n_max; \ + panel_dim_max = pd_p; \ + vs_c = rs_c; \ + m_panel_use = &panel_dim_i; \ + n_panel_use = &panel_len_i; \ + m_panel_max = &panel_dim_max; \ + n_panel_max = &panel_len_max_i; \ + } \ +\ + /* Compute the total number of iterations we'll need. */ \ + num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 
1 : 0 ); \ +\ + { \ + ic_inc = panel_dim_max; \ + ip_inc = 1; \ + } \ +\ + p_begin = p_cast; \ +\ +/* +if ( row_stored ) \ +PASTEMAC(chc,fprintm)( stdout, "packm_blk_var1_md: b orig", m, n, \ + c_cast, rs_c, cs_c, "%5.2f", "" ); \ +if ( col_stored ) \ +PASTEMAC(chc,fprintm)( stdout, "packm_blk_var1_md: a orig", m, n, \ + c_cast, rs_c, cs_c, "%5.2f", "" ); \ +*/ \ +\ + for ( ic = 0, ip = 0, it = 0; it < num_iter; \ + ic += ic_inc, ip += ip_inc, it += 1 ) \ + { \ + panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ +\ + c_begin = c_cast + (ic )*vs_c; \ +\ + { \ + c_use = c_begin; \ + p_use = p_begin; \ +\ + panel_len_i = panel_len_full; \ + panel_len_max_i = panel_len_max; \ +\ + if( packm_thread_my_iter( it, thread ) ) \ + { \ + PASTEMAC2(chc,chp,packm_struc_cxk_md) \ + ( \ + conjc, \ + schema, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_use, rs_c, cs_c, \ + p_use, rs_p, cs_p, \ + is_p, \ + cntx \ + ); \ + } \ +\ + p_inc = ps_p; \ + } \ +\ +/* +if ( row_stored ) \ +PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: b packed", *m_panel_max, *n_panel_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +else \ +PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: a packed", *m_panel_max, *n_panel_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +*/ \ +\ + p_begin += p_inc; \ +\ + } \ +} + +INSERT_GENTFUNC2_BASIC0( packm_blk_var1_md ) +INSERT_GENTFUNC2_MIXDP0( packm_blk_var1_md ) + +#endif diff --git a/frame/1m/packm/bli_packm_blk_var1_md.h b/frame/1m/packm/bli_packm_blk_var1_md.h new file mode 100644 index 000000000..519749143 --- /dev/null +++ b/frame/1m/packm/bli_packm_blk_var1_md.h @@ -0,0 +1,67 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +void bli_packm_blk_var1_md + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* t + ); + + +#undef GENTPROT2 +#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \ +\ +void PASTEMAC2(chc,chp,varname) \ + ( \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* cntx, \ + thrinfo_t* thread \ + ); + +INSERT_GENTPROT2_BASIC0( packm_blk_var1_md ) +INSERT_GENTPROT2_MIXDP0( packm_blk_var1_md ) + diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 0afd06e27..4ecef849f 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -211,13 +211,14 @@ siz_t bli_packm_init_pack bli_init_once(); num_t dt = bli_obj_dt( a ); + num_t dt_tar = bli_obj_target_dt( a ); trans_t transa = bli_obj_onlytrans_status( a ); dim_t m_a = bli_obj_length( a ); dim_t n_a = bli_obj_width( a ); - dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx ); - dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx ); - dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_n, cntx ); - dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_n, cntx ); + dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); + dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); + dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); + dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_n, cntx ); dim_t m_p, n_p; dim_t m_p_pad, n_p_pad; @@ -230,6 +231,17 @@ siz_t bli_packm_init_pack // We begin by copying the fields of A. bli_obj_alias_to( a, p ); + // Typecast the internal scalar value to the target datatype. + // NOTE: This must happen BEFORE we change the datatype of P to reflect + // the target_dt. 
+ if ( dt != dt_tar ) + { + bli_obj_scalar_cast_to( dt_tar, p ); + } + + // Update the datatype of P to be the target datatype of A. + bli_obj_set_dt( dt_tar, p ); + // Update the dimension fields to explicitly reflect a transposition, // if needed. // Then, clear the conjugation and transposition fields from the object diff --git a/frame/1m/packm/bli_packm_md.h b/frame/1m/packm/bli_packm_md.h new file mode 100644 index 000000000..7620a572e --- /dev/null +++ b/frame/1m/packm/bli_packm_md.h @@ -0,0 +1,37 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_packm_blk_var1_md.h" +#include "bli_packm_struc_cxk_md.h" + diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.c b/frame/1m/packm/bli_packm_struc_cxk_md.c new file mode 100644 index 000000000..33d720f30 --- /dev/null +++ b/frame/1m/packm/bli_packm_struc_cxk_md.c @@ -0,0 +1,294 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_GEMM_MD + +#undef GENTFUNC2 +#define GENTFUNC2( ctype_c, ctype_p, chc, chp, varname ) \ +\ +void PASTEMAC2(chc,chp,varname) \ + ( \ + conj_t conjc, \ + pack_t schema, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype_p* restrict kappa, \ + ctype_c* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype_p* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, \ + cntx_t* cntx \ + ) \ +{ \ + dim_t panel_dim; \ + dim_t panel_len; \ + inc_t incc, ldc; \ + inc_t ldp; \ +\ +\ + /* Determine the dimensions and relative strides of the micro-panel + based on its pack schema. */ \ + if ( bli_is_col_packed( schema ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( bli_is_row_packed( schema ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ +\ + if ( bli_is_nat_packed( schema ) ) \ + { \ + trans_t transc = ( trans_t )conjc; \ +\ + /* NOTE: We ignore kappa for now, since it should be 1.0. 
*/ \ + PASTEMAC2(chc,chp,castm) \ + ( \ + transc, \ + panel_dim, \ + panel_len, \ + c, incc, ldc, \ + p, 1, ldp \ + ); \ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). When these dimensions are inflated, it creates empty + regions along the bottom and/or right edges of the matrix. If + either region exists, we set them to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases. */ \ + if ( m_panel != m_panel_max ) \ + { \ + ctype_p* restrict zero = PASTEMAC(chp,0); \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype_p* p_edge = p + (i )*rs_p; \ +\ + PASTEMAC2(chp,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero, \ + p_edge, rs_p, cs_p, \ + cntx, \ + NULL \ + ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + ctype_p* restrict zero = PASTEMAC(chp,0); \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype_p* p_edge = p + (j )*cs_p; \ +\ + PASTEMAC2(chp,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero, \ + p_edge, rs_p, cs_p, \ + cntx, \ + NULL \ + ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + /* NOTE: We ignore kappa for now, since it should be 1.0. 
*/ \ + PASTEMAC2(chc,chp,packm_cxk_1r_md) \ + ( \ + conjc, \ + panel_dim, \ + panel_len, \ + c, incc, ldc, \ + p, ldp \ + ); \ +\ + if ( m_panel != m_panel_max ) \ + { \ + ctype_p* restrict zero = PASTEMAC(chp,0); \ + dim_t offm = m_panel; \ + dim_t offn = 0; \ + dim_t m_edge = m_panel_max - m_panel; \ + dim_t n_edge = n_panel_max; \ +\ + ( void ) zero; \ + ( void ) m_edge; ( void )offm; \ + ( void ) n_edge; ( void )offn; \ +\ + PASTEMAC(chp,set1ms_mxn) \ + ( \ + schema, \ + offm, \ + offn, \ + m_edge, \ + n_edge, \ + zero, \ + p, rs_p, cs_p, ldp \ + ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + ctype_p* restrict zero = PASTEMAC(chp,0); \ + dim_t offm = 0; \ + dim_t offn = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - n_panel; \ +\ + ( void ) zero; \ + ( void ) m_edge; ( void )offm; \ + ( void ) n_edge; ( void )offn; \ +\ + PASTEMAC(chp,set1ms_mxn) \ + ( \ + schema, \ + offm, \ + offn, \ + m_edge, \ + n_edge, \ + zero, \ + p, rs_p, cs_p, ldp \ + ); \ + } \ + } \ +\ +\ +/* + if ( bli_is_col_packed( schema ) ) \ + PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: bp copied", m_panel_max, n_panel_max, \ + p, rs_p, cs_p, "%4.1f", "" ); \ + else if ( bli_is_row_packed( schema ) ) \ + PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: ap copied", m_panel_max, n_panel_max, \ + p, rs_p, cs_p, "%4.1f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC2_BASIC0( packm_struc_cxk_md ) +INSERT_GENTFUNC2_MIXDP0( packm_struc_cxk_md ) + + +// ----------------------------------------------------------------------------- + +#undef GENTFUNC2 +#define GENTFUNC2( ctype_a, ctype_p, cha, chp, opname ) \ +\ +void PASTEMAC2(cha,chp,opname) \ + ( \ + conj_t conja, \ + dim_t m, \ + dim_t n, \ + ctype_a* restrict a, inc_t inca, inc_t lda, \ + ctype_p* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + PASTEMAC(cha,ctyper)* restrict alpha1_r = ( PASTEMAC(cha,ctyper)* )a; \ + 
PASTEMAC(cha,ctyper)* restrict alpha1_i = ( PASTEMAC(cha,ctyper)* )a + 1; \ + PASTEMAC(chp,ctyper)* restrict pi1_r = ( PASTEMAC(chp,ctyper)* )p; \ + PASTEMAC(chp,ctyper)* restrict pi1_i = ( PASTEMAC(chp,ctyper)* )p + ldp; \ +\ + dim_t i; \ +\ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + for ( i = 0; i < m; ++i ) \ + { \ + PASTEMAC2(cha,chp,copyjris)( *(alpha1_r + i*inca2), \ + *(alpha1_i + i*inca2), \ + *(pi1_r + i*1), \ + *(pi1_i + i*1) ); \ + } \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + for ( i = 0; i < m; ++i ) \ + { \ + PASTEMAC2(cha,chp,copyris)( *(alpha1_r + i*inca2), \ + *(alpha1_i + i*inca2), \ + *(pi1_r + i*1), \ + *(pi1_i + i*1) ); \ + } \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ +} + +INSERT_GENTFUNC2_BASIC0( packm_cxk_1r_md ) +INSERT_GENTFUNC2_MIXDP0( packm_cxk_1r_md ) + +#endif diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.h b/frame/1m/packm/bli_packm_struc_cxk_md.h new file mode 100644 index 000000000..a0c56401e --- /dev/null +++ b/frame/1m/packm/bli_packm_struc_cxk_md.h @@ -0,0 +1,71 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#undef GENTPROT2 +#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \ +\ +void PASTEMAC2(chc,chp,varname) \ + ( \ + conj_t conjc, \ + pack_t schema, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype_p* restrict kappa, \ + ctype_c* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype_p* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md ) +INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md ) + + +#undef GENTPROT2 +#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \ +\ +void PASTEMAC2(cha,chp,opname) \ + ( \ + conj_t conja, \ + dim_t m, \ + dim_t n, \ + ctype_a* restrict a, inc_t inca, inc_t lda, \ + ctype_p* restrict p, inc_t ldp \ + ); + +INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md ) +INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md ) + diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index d1bd9dec2..52895f121 100644 --- a/frame/3/bli_l3_check.c +++ 
b/frame/3/bli_l3_check.c @@ -295,13 +295,33 @@ void bli_gemm_basic_check e_val = bli_check_level3_dims( a, b, c ); bli_check_error_code( e_val ); +#ifdef BLIS_ENABLE_GEMM_MD + // Skip checking for consistent datatypes between A, B, and C since + // that is totally valid for mixed-datatype gemm. + + // When mixing datatypes, make sure that alpha does not have a non-zero + // imaginary component. + if ( bli_obj_dt( c ) != bli_obj_dt( a ) || + bli_obj_dt( c ) != bli_obj_dt( b ) || + bli_obj_comp_prec( c ) != bli_obj_prec( c ) ) + if ( !bli_obj_imag_is_zero( alpha ) ) + { + bli_print_msg( "Mixed-datatype gemm does not yet support alpha with a non-zero imaginary component. Please contact BLIS developers for further support.", __FILE__, __LINE__ ); + bli_abort(); + } + +#else // BLIS_DISABLE_GEMM_MD + // Check for consistent datatypes. + // NOTE: We only perform these tests when mixed datatype support is + // disabled. e_val = bli_check_consistent_object_datatypes( c, a ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_datatypes( c, b ); bli_check_error_code( e_val ); +#endif } void bli_hemm_basic_check diff --git a/frame/3/bli_l3_oapi.c b/frame/3/bli_l3_oapi.c index 7cfcd0f94..701428a59 100644 --- a/frame/3/bli_l3_oapi.c +++ b/frame/3/bli_l3_oapi.c @@ -58,11 +58,13 @@ void PASTEMAC(opname,EX_SUF) \ BLIS_OAPI_EX_DECLS \ \ /* Only proceed with an induced method if all operands have the same - (complex) datatype. If any datatypes differ, skip the induced method - chooser function and proceed directly with native execution, which is + (complex) datatype, and if that datatype matches the execution + datatype. If any datatypes differ, skip the induced method chooser + function and proceed directly with native execution, which is where mixed datatype support will be implemented (if at all). 
*/ \ - if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ - bli_obj_dt( b ) == bli_obj_dt( c ) && \ + if ( bli_obj_dt( c ) == bli_obj_dt( a ) && \ + bli_obj_dt( c ) == bli_obj_dt( b ) && \ + bli_obj_dt( c ) == bli_obj_exec_dt( c ) && \ bli_obj_is_complex( c ) ) \ { \ /* Invoke the operation's "ind" function--its induced method front-end. diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h index 5349e9750..987ba93bd 100644 --- a/frame/3/gemm/bli_gemm.h +++ b/frame/3/gemm/bli_gemm.h @@ -38,3 +38,7 @@ #include "bli_gemm_var.h" +// Mixed datatype support. +#ifdef BLIS_ENABLE_GEMM_MD +#include "bli_gemm_md.h" +#endif diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 8518d6102..62d1ae502 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -54,8 +54,13 @@ void bli_gemm_front obj_t c_local; #ifdef BLIS_ENABLE_SMALL_MATRIX - gint_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl ); - if ( status == BLIS_SUCCESS ) return; + // Only handle small problems separately for homogeneous datatypes. + if ( bli_obj_dt( a ) == bli_obj_dt( b ) && + bli_obj_dt( a ) == bli_obj_dt( c ) ) + { + gint_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl ); + if ( status == BLIS_SUCCESS ) return; + } #endif // Check parameters. @@ -74,38 +79,33 @@ void bli_gemm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); - // An optimization: If C is stored by rows and the micro-kernel prefers - // contiguous columns, or if C is stored by columns and the micro-kernel - // prefers contiguous rows, transpose the entire operation to allow the - // micro-kernel to access elements of C in its preferred manner. 
- if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) - { - bli_obj_swap( &a_local, &b_local ); +#ifdef BLIS_ENABLE_GEMM_MD + cntx_t cntx_local; - bli_obj_induce_trans( &a_local ); - bli_obj_induce_trans( &b_local ); - bli_obj_induce_trans( &c_local ); + // If any of the storage datatypes differ, or if the computation precision + // differs from the storage precision of C, utilize the mixed datatype + // code path. + // NOTE: If we ever want to support the caller setting the computation + // domain explicitly, we will need to check the computation dt against the + // storage dt of C (instead of the computation precision against the + // storage precision of C). + if ( bli_obj_dt( &c_local ) != bli_obj_dt( &a_local ) || + bli_obj_dt( &c_local ) != bli_obj_dt( &b_local ) || + bli_obj_comp_prec( &c_local ) != bli_obj_prec( &c_local ) ) + { + // Handle mixed datatype cases in bli_gemm_md(), which may modify + // the objects or the context. (If the context is modified, cntx + // is adjusted to point to cntx_local.) + bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx ); } - - // Parse and interpret the contents of the rntm_t object to properly - // set the ways of parallelism for each loop, and then make any - // additional modifications necessary for the current operation. - bli_rntm_set_ways_for_op - ( - BLIS_GEMM, - BLIS_LEFT, // ignored for gemm/hemm/symm - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ), - rntm - ); - + else // homogeneous datatypes +#endif { - // A sort of hack for communicating the desired pach schemas for A and B - // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // A sort of hack for communicating the desired pack schemas for A and + // B to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and // bli_l3_cntl_create_if()). 
This allows us to access the schemas from - // the control tree, which hopefully reduces some confusion, particularly - // in bli_packm_init(). + // the control tree, which hopefully reduces some confusion, + // particularly in bli_packm_init(). if ( bli_cntx_method( cntx ) == BLIS_NAT ) { bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); @@ -121,6 +121,129 @@ void bli_gemm_front } } +#ifdef BLIS_ENABLE_GEMM_MD + // Don't perform the following optimization for ccr or crc cases, as + // those cases are sensitive to the ukernel storage preference (ie: + // transposing the operation would break them). + if ( !bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) && + !bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) ) +#endif + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + { + bli_obj_swap( &a_local, &b_local ); + + bli_obj_induce_trans( &a_local ); + bli_obj_induce_trans( &b_local ); + bli_obj_induce_trans( &c_local ); + + // We must also swap the pack schemas, which were set by bli_gemm_md() + // or the inlined code above. + bli_obj_swap_pack_schemas( &a_local, &b_local ); + } + + // Parse and interpret the contents of the rntm_t object to properly + // set the ways of parallelism for each loop, and then make any + // additional modifications necessary for the current operation. 
+ bli_rntm_set_ways_for_op + ( + BLIS_GEMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + rntm + ); + + obj_t* cp = &c_local; + obj_t* betap = beta; + +#ifdef BLIS_ENABLE_GEMM_MD +#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM + // If any of the following conditions are met, create a temporary matrix + // conformal to C into which we will accumulate the matrix product: + // - the storage precision of C differs from the computation precision; + // - the domains are mixed as crr; + // - the storage format of C does not match the preferred orientation + // of the ccr or crc cases. + // Then, after the computation is complete, this matrix will be copied + // or accumulated back to C. + const bool_t is_ccr_mismatch = + ( bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) && + !bli_obj_is_col_stored( &c_local ) ); + const bool_t is_crc_mismatch = + ( bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) && + !bli_obj_is_row_stored( &c_local ) ); + + obj_t ct; + bool_t use_ct = FALSE; + + // FGVZ: Consider adding another guard here that only creates and uses a + // temporary matrix for accumulation if k < c * kc, where c is some small + // constant like 2. And don't forget to use the same conditional for the + // castm() and free() at the end. + if ( + bli_obj_prec( &c_local ) != bli_obj_comp_prec( &c_local ) || + bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) || + is_ccr_mismatch || + is_crc_mismatch + ) + { + use_ct = TRUE; + } + + // If we need a temporary matrix conformal to C for whatever reason, + // we create it and prepare to use it now. 
+ if ( use_ct ) + { + const dim_t m = bli_obj_length( &c_local ); + const dim_t n = bli_obj_width( &c_local ); + inc_t rs = bli_obj_row_stride( &c_local ); + inc_t cs = bli_obj_col_stride( &c_local ); + + num_t dt_ct = bli_obj_domain( &c_local ) | + bli_obj_comp_prec( &c_local ); + + // When performing the crr case, accumulate to a contiguously-stored + // real matrix so we do not have to repeatedly update C with general + // stride. + if ( bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) ) + dt_ct = BLIS_REAL | bli_obj_comp_prec( &c_local ); + + // When performing the mismatched ccr or crc cases, now is the time + // to specify the appropriate storage so the gemm_md_c2r_ref() virtual + // microkernel can output directly to C (instead of using a temporary + // microtile). + if ( is_ccr_mismatch ) { rs = 1; cs = m; } + else if ( is_crc_mismatch ) { rs = n; cs = 1; } + + bli_obj_create( dt_ct, m, n, rs, cs, &ct ); + + const num_t dt_exec = bli_obj_exec_dt( &c_local ); + const num_t dt_comp = bli_obj_comp_dt( &c_local ); + + bli_obj_set_target_dt( dt_ct, &ct ); + bli_obj_set_exec_dt( dt_exec, &ct ); + bli_obj_set_comp_dt( dt_comp, &ct ); + + // A naive approach would cast C to the computation datatype, + // compute with beta, and then cast the result back to the + // user-provided output matrix. However, we employ a different + // approach that halves the number of memops on C (or its + // typecast temporary) by writing the A*B product directly to + // temporary storage, and then using xpbym to scale the + // output matrix by beta and accumulate/cast the A*B product. + //bli_castm( &c_local, &ct ); + betap = &BLIS_ZERO; + + cp = &ct; + } +#endif +#endif + // Invoke the internal back-end via the thread handler. 
bli_l3_thread_decorator ( @@ -129,11 +252,112 @@ void bli_gemm_front alpha, &a_local, &b_local, - beta, - &c_local, + betap, + cp, cntx, rntm, cntl ); + +#ifdef BLIS_ENABLE_GEMM_MD +#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM + // If we created a temporary matrix conformal to C for whatever reason, + // we copy/accumulate the result back to C and then release the object. + if ( use_ct ) + { + //bli_castnzm( &ct, &c_local ); + bli_xpbym( &ct, beta, &c_local ); + + bli_obj_free( &ct ); + } +#endif +#endif } +// ----------------------------------------------------------------------------- + +#if 0 + if ( bli_obj_dt( a ) != bli_obj_dt( b ) || + bli_obj_dt( a ) != bli_obj_dt( c ) || + bli_obj_comp_prec( c ) != bli_obj_prec( c ) ) + { + const bool_t a_is_real = bli_obj_is_real( a ); + const bool_t a_is_comp = bli_obj_is_complex( a ); + const bool_t b_is_real = bli_obj_is_real( b ); + const bool_t b_is_comp = bli_obj_is_complex( b ); + const bool_t c_is_real = bli_obj_is_real( c ); + const bool_t c_is_comp = bli_obj_is_complex( c ); + + const bool_t a_is_single = bli_obj_is_single_prec( a ); + const bool_t a_is_double = bli_obj_is_double_prec( a ); + const bool_t b_is_single = bli_obj_is_single_prec( b ); + const bool_t b_is_double = bli_obj_is_double_prec( b ); + const bool_t c_is_single = bli_obj_is_single_prec( c ); + const bool_t c_is_double = bli_obj_is_double_prec( c ); + + const bool_t comp_single = bli_obj_comp_prec( c ) == BLIS_SINGLE_PREC; + const bool_t comp_double = bli_obj_comp_prec( c ) == BLIS_DOUBLE_PREC; + + const bool_t mixeddomain = bli_obj_domain( c ) != bli_obj_domain( a ) || + bli_obj_domain( c ) != bli_obj_domain( b ); + + ( void )a_is_real; ( void )a_is_comp; + ( void )b_is_real; ( void )b_is_comp; + ( void )c_is_real; ( void )c_is_comp; + ( void )a_is_single; ( void )a_is_double; + ( void )b_is_single; ( void )b_is_double; + ( void )c_is_single; ( void )c_is_double; + ( void )comp_single; ( void )comp_double; + + if ( + //( c_is_comp && a_is_comp && 
b_is_real ) || + //( c_is_comp && a_is_real && b_is_comp ) || + //( c_is_real && a_is_comp && b_is_comp ) || + //( c_is_comp && a_is_real && b_is_real ) || + //( c_is_real && a_is_comp && b_is_real ) || + //( c_is_real && a_is_real && b_is_comp ) || + //FALSE + TRUE + ) + { + if ( + ( c_is_single && a_is_single && b_is_single && mixeddomain ) || + ( c_is_single && a_is_single && b_is_single && comp_single ) || + ( c_is_single && a_is_single && b_is_single && comp_double ) || + ( c_is_single && a_is_single && b_is_double ) || + ( c_is_single && a_is_double && b_is_single ) || + ( c_is_double && a_is_single && b_is_single ) || + ( c_is_single && a_is_double && b_is_double ) || + ( c_is_double && a_is_single && b_is_double ) || + ( c_is_double && a_is_double && b_is_single ) || + ( c_is_double && a_is_double && b_is_double && comp_single ) || + ( c_is_double && a_is_double && b_is_double && comp_double ) || + ( c_is_double && a_is_double && b_is_double && mixeddomain ) || + FALSE + ) + bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl ); + else + bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl ); + } + else + bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl ); + return; + } +#else +#if 0 + // If any of the storage datatypes differ, or if the execution precision + // differs from the storage precision of C, utilize the mixed datatype + // code path. + // NOTE: We could check the exec dt against the storage dt of C, but for + // now we don't support the caller setting the execution domain + // explicitly. 
+ if ( bli_obj_dt( a ) != bli_obj_dt( b ) || + bli_obj_dt( a ) != bli_obj_dt( c ) || + bli_obj_comp_prec( c ) != bli_obj_prec( c ) ) + { + bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl ); + return; + } +#endif +#endif + diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 1967c6ce4..1aa032ad9 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -69,6 +69,22 @@ void bli_gemm_ker_var2 thrinfo_t* thread ) { +#ifdef BLIS_ENABLE_GEMM_MD + // By now, A and B have been packed and cast to the execution precision. + // In most cases, such as when storage precision of C differs from the + // execution precision, we utilize the mixed datatype code path. However, + // a few cases still fall within this kernel, such as mixed domain with + // equal precision (ccr, crc, rcc), hence those expressions being disabled + // in the conditional below. + if ( //( bli_obj_domain( c ) != bli_obj_domain( a ) ) || + //( bli_obj_domain( c ) != bli_obj_domain( b ) ) || + ( bli_obj_dt( c ) != bli_obj_exec_dt( c ) ) ) + { + bli_gemm_ker_var2_md( a, b, c, cntx, rntm, cntl, thread ); + return; + } +#endif + num_t dt_exec = bli_obj_exec_dt( c ); pack_t schema_a = bli_obj_pack_schema( a ); @@ -112,12 +128,12 @@ void bli_gemm_ker_var2 buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); - // If 1m is being employed on a column- or row-stored matrix with a - // real-valued beta, we can use the real domain macro-kernel, which + // If 1m is being employed on a column- or row-stored matrix with a + // real-valued beta, we can use the real domain macro-kernel, which // eliminates a little overhead associated with the 1m virtual // micro-kernel. 
#if 1 - if ( bli_is_1m_packed( schema_a ) ) + if ( bli_cntx_method( cntx ) == BLIS_1M ) { bli_l3_ind_recast_1m_params ( @@ -132,6 +148,22 @@ void bli_gemm_ker_var2 } #endif +#ifdef BLIS_ENABLE_GEMM_MD + // Tweak parameters in select mixed domain cases. + bli_gemm_md_ker_var2_recast + ( + &dt_exec, + bli_obj_dt( a ), + bli_obj_dt( b ), + bli_obj_dt( c ), + &m, &n, &k, + &pd_a, &ps_a, + &pd_b, &ps_b, + c, + &rs_c, &cs_c + ); +#endif + // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; @@ -267,6 +299,9 @@ void PASTEMAC(ch,varname) \ /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ diff --git a/frame/3/gemm/bli_gemm_ker_var2_md.c b/frame/3/gemm/bli_gemm_ker_var2_md.c new file mode 100644 index 000000000..e414722b9 --- /dev/null +++ b/frame/3/gemm/bli_gemm_ker_var2_md.c @@ -0,0 +1,405 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_GEMM_MD + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T) + ( + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY2_ALL(ftypes,gemm_ker_var2_md); + + +void bli_gemm_ker_var2_md + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + num_t dt_c = bli_obj_dt( c ); + + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t is_a = 
bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + obj_t scalar_a; + obj_t scalar_b; + + void* buf_alpha; + void* buf_beta; + + FUNCPTR_T f; + + // Detach and multiply the scalars attached to A and B. + // NOTE: We know that the internal scalars of A and B are already of the + // target datatypes because the necessary typecasting would have already + // taken place during bli_packm_init(). + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + // NOTE: We know that scalar_b is of type dt_exec due to the above code + // that casts the scalars of A and B to dt_exec via scalar_a and scalar_b, + // and we know that the internal scalar in C is already of the type dt_c + // due to the casting in the implementation of bli_obj_scalar_attach(). + buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Tweak parameters in select mixed domain cases. + bli_gemm_md_ker_var2_recast + ( + &dt_exec, + bli_obj_dt( a ), + bli_obj_dt( b ), + bli_obj_dt( c ), + &m, &n, &k, + &pd_a, &ps_a, + &pd_b, &ps_b, + c, + &rs_c, &cs_c + ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_c][dt_exec]; + + // Invoke the function. 
+ f( schema_a, + schema_b, + m, + n, + k, + buf_alpha, + buf_a, cs_a, is_a, + pd_a, ps_a, + buf_b, rs_b, is_b, + pd_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread ); +} + + +#undef GENTFUNC2 +#define GENTFUNC2( ctype_c, ctype_e, chc, che, varname ) \ +\ +void PASTEMAC2(chc,che,varname) \ + ( \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dte = PASTEMAC(che,type); \ + /*const num_t dtc = PASTEMAC(chc,type);*/ \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(che,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dte, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype_e ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype_e ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dte, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype_e* restrict zero = PASTEMAC(che,0); \ + ctype_e* restrict a_cast = a; \ + ctype_e* restrict b_cast = b; \ + ctype_c* restrict c_cast = c; \ + ctype_e* restrict alpha_cast = alpha; \ + ctype_c* restrict beta_cast = beta; \ + ctype_e* restrict b1; \ + ctype_c* restrict c1; \ +\ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t i, j; \ + dim_t m_cur; \ + dim_t n_cur; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(che,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. 
*/ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ + dim_t jr_num_threads = bli_thread_n_way( thread ); \ + dim_t jr_thread_id = bli_thread_work_id( thread ); \ + dim_t ir_num_threads = bli_thread_n_way( caucus ); \ + dim_t ir_thread_id = bli_thread_work_id( caucus ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ + { \ + ctype_e* restrict a1; \ + ctype_c* restrict c11; \ + ctype_e* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ + { \ + ctype_e* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ + if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ + if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Always save the micropanel product to the local microtile and + then accumulate it into C via the xpbys_mxn macro. */ \ + /*if ( 1 )*/ \ + { \ + /*bli_auxinfo_set_dt_on_output( dte, &aux );*/ \ +\ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the microtile of C and add the result from above. */ \ + PASTEMAC3(che,chc,chc,xpbys_mxn) \ + ( \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c \ + ); \ + } \ +/* + else if ( m_cur == MR && n_cur == NR ) \ + { \ + bli_auxinfo_set_dt_on_output( dtc, &aux ); \ +\ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + ( ctype_e* )beta_cast, \ + ( ctype_e* )c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + bli_auxinfo_set_dt_on_output( dte, &aux ); \ +\ + gemm_ukr \ + ( \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + PASTEMAC3(che,chc,chc,xpbys_mxn) \ + ( \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c \ + ); \ + } \ +*/ \ + } \ + } \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ +} + +INSERT_GENTFUNC2_BASIC0( gemm_ker_var2_md ) +INSERT_GENTFUNC2_MIXDP0( gemm_ker_var2_md ) + +#endif diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c new file mode 100644 index 000000000..9ccb49225 --- /dev/null +++ b/frame/3/gemm/bli_gemm_md.c @@ -0,0 +1,901 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2017, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_GEMM_MD + +void bli_gemm_md + ( + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + cntx_t** cntx + ) +{ + mddm_t doms; + + const bool_t a_is_real = bli_obj_is_real( a ); + const bool_t a_is_comp = bli_obj_is_complex( a ); + const bool_t b_is_real = bli_obj_is_real( b ); + const bool_t b_is_comp = bli_obj_is_complex( b ); + const bool_t c_is_real = bli_obj_is_real( c ); + const bool_t c_is_comp = bli_obj_is_complex( c ); + + if ( c_is_real && a_is_real && b_is_real ) + { + // C_real += A_real * B_real + doms = bli_gemm_md_rrr( a, b, beta, c, cntx_local, cntx ); + } + else if ( c_is_comp && a_is_comp && b_is_comp ) + { + // C_complex += A_complex * B_complex + doms = bli_gemm_md_ccc( a, b, beta, c, cntx_local, cntx ); + } + else if ( c_is_comp && a_is_comp && b_is_real ) + { + // C_complex += A_complex * B_real + doms = bli_gemm_md_ccr( a, b, beta, c, cntx_local, cntx ); + } + else if ( c_is_comp && a_is_real && b_is_comp ) + { + // C_complex += A_real * B_complex + doms = bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx ); + } + else if ( c_is_real && a_is_comp && b_is_comp ) + { + // C_real += A_complex * B_complex + doms = bli_gemm_md_rcc( a, b, beta, c, cntx_local, cntx ); + } + else if ( c_is_comp && a_is_real && b_is_real ) + { + // C_complex += A_real * B_real + doms = bli_gemm_md_crr( a, b, beta, c, cntx_local, cntx ); + } + else if ( c_is_real && a_is_comp && b_is_real ) + { + // C_real += A_complex * B_real + doms = bli_gemm_md_rcr( a, b, beta, c, cntx_local, cntx ); + } + else if ( c_is_real && a_is_real && b_is_comp ) + { + // C_real += A_real * B_complex + doms = bli_gemm_md_rrc( a, b, beta, c, cntx_local, cntx ); + } + else + { + doms.comp = BLIS_REAL; + doms.exec = BLIS_REAL; + + // This should never execute. + bli_abort(); + } + + // Extract the computation and execution domains from the struct + // returned above. 
+ dom_t dom_comp = doms.comp; + dom_t dom_exec = doms.exec; + + // Inspect the computation precision of C. (The user may have set + // this explicitly to request the precision in which the computation + // should take place.) + prec_t prec_comp = bli_obj_comp_prec( c ); + + // The computation precision tells us the target precision of A and B. + // NOTE: We don't set the target domain here. The target domain would + // either be unchanged, or would have been changed in one of the eight + // domain cases above. + bli_obj_set_target_prec( prec_comp, a ); + bli_obj_set_target_prec( prec_comp, b ); + + // Combine the execution domain with the computation precision to form + // the execution datatype. (The computation precision and execution + // precision are always equal.) + num_t dt_exec = dom_exec | prec_comp; + + // Set the execution datatypes of A, B, and C. + bli_obj_set_exec_dt( dt_exec, a ); + bli_obj_set_exec_dt( dt_exec, b ); + bli_obj_set_exec_dt( dt_exec, c ); + + // Combine the computation precision and computation domain to form the + // computation datatype. + num_t dt_comp = dom_comp | prec_comp; + + // Set the computation datatypes of A, B, and C. 
+ bli_obj_set_comp_dt( dt_comp, a ); + bli_obj_set_comp_dt( dt_comp, b ); + bli_obj_set_comp_dt( dt_comp, c ); + + + +#if 0 + if ( bli_obj_is_single_prec( c ) ) printf( "%% --> s += " ); + else printf( "%% --> d += " ); + if ( bli_obj_is_single_prec( a ) ) printf( "s " ); + else printf( "d " ); + if ( bli_obj_is_single_prec( b ) ) printf( "s\n" ); + else printf( "d\n" ); + + //if ( bli_obj_is_scomplex( a ) && + // bli_obj_is_dcomplex( b ) && + // bli_obj_is_float( c ) ) + { + printf( "bli_gemm_md(): stor precs after: %d %d %d\n", bli_obj_prec( a ), + bli_obj_prec( b ), bli_obj_prec( c ) ); + printf( "bli_gemm_md(): targ precs after: %d %d %d\n", bli_obj_target_prec( a ), + bli_obj_target_prec( b ), bli_obj_target_prec( c ) ); + printf( "bli_gemm_md(): exec precs after: %d %d %d\n", bli_obj_exec_prec( a ), + bli_obj_exec_prec( b ), bli_obj_exec_prec( c ) ); + printf( "bli_gemm_md(): stor domain after: %d %d %d\n", bli_obj_domain( a ), + bli_obj_domain( b ), bli_obj_domain( c ) ); + printf( "bli_gemm_md(): targ domain after: %d %d %d\n", bli_obj_target_domain( a ), + bli_obj_target_domain( b ), bli_obj_target_domain( c ) ); + printf( "bli_gemm_md(): exec domain after: %d %d %d\n", bli_obj_exec_domain( a ), + bli_obj_exec_domain( b ), bli_obj_exec_domain( c ) ); + } +#endif +} + +// ----------------------------------------------------------------------------- + +// cab +mddm_t bli_gemm_md_ccr + ( + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + cntx_t** cntx + ) +{ + mddm_t doms; + + // We assume that the requested computation domain is complex. + //dom_t dom_comp_in = bli_obj_comp_domain( c ); + //dom_t dom_comp_in = BLIS_COMPLEX; + + // For ccr, the computation (ukernel) will be real, but the execution + // will appear complex to other parts of the implementation. 
+ doms.comp = BLIS_REAL; + doms.exec = BLIS_COMPLEX; + + // Here we construct the computation datatype, which for the ccr case + // is equal to the real projection of the execution datatype, and use + // that computation datatype to query the corresponding ukernel output + // preference. + const num_t dt = BLIS_REAL | bli_obj_comp_prec( c ); + const bool_t row_pref + = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, *cntx ); + + // We can only perform this case of mixed-domain gemm, C += A*B where + // B is real, if the microkernel prefers column output. If it prefers + // row output, we must induce a transposition and perform C += A*B + // where A (formerly B) is real. + if ( row_pref ) + { + bli_obj_swap( a, b ); + + bli_obj_induce_trans( a ); + bli_obj_induce_trans( b ); + bli_obj_induce_trans( c ); + + return bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx ); + } + + // Create a local copy of the context and then prepare to use this + // context instead of the one passed in. + *cntx_local = **cntx; + *cntx = cntx_local; + + // Copy the real domain blocksizes into the slots of their complex + // counterparts. 
+ blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx ); + blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx ); + blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx ); + blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx ); + blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx ); + + bli_blksz_copy_dt( BLIS_FLOAT, blksz_mr, BLIS_SCOMPLEX, blksz_mr ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr ); + + bli_blksz_copy_dt( BLIS_FLOAT, blksz_nr, BLIS_SCOMPLEX, blksz_nr ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr ); + + bli_blksz_copy_dt( BLIS_FLOAT, blksz_mc, BLIS_SCOMPLEX, blksz_mc ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc ); + + bli_blksz_copy_dt( BLIS_FLOAT, blksz_nc, BLIS_SCOMPLEX, blksz_nc ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc ); + + bli_blksz_copy_dt( BLIS_FLOAT, blksz_kc, BLIS_SCOMPLEX, blksz_kc ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc ); + + // Halve both the real and complex MR's (which are both real MR's). + bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, blksz_mr ); + bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, blksz_mr ); + bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mr ); + bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mr ); + + // Halve both the real and complex MC's (which are both real MC's). + bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, blksz_mc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, blksz_mc ); + bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mc ); + + // Set the pack schemas of objects A and B for normal execution. 
+ bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, a ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, b ); + + // static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) + func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx ); + + // Rather than check which complex datatype dt_comp refers to, we set + // the mixed-domain virtual microkernel for both types. + bli_func_set_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, l3_vir_ukrs ); + bli_func_set_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, l3_vir_ukrs ); + + // Return the computation and execution domains. + return doms; +} + +// ----------------------------------------------------------------------------- + +// cab +mddm_t bli_gemm_md_crc + ( + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + cntx_t** cntx + ) +{ + mddm_t doms; + + // We assume that the requested computation domain is complex. + //dom_t dom_comp_in = bli_obj_comp_domain( c ); + //dom_t dom_comp_in = BLIS_COMPLEX; + + // For crc, the computation (ukernel) will be real, but the execution + // will appear complex to other parts of the implementation. + doms.comp = BLIS_REAL; + doms.exec = BLIS_COMPLEX; + + // Here we construct the computation datatype, which for the crc case + // is equal to the real projection of the execution datatype, and use + // that computation datatype to query the corresponding ukernel output + // preference. + const num_t dt = BLIS_REAL | bli_obj_comp_prec( c ); + const bool_t col_pref + = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, *cntx ); + + // We can only perform this case of mixed-domain gemm, C += A*B where + // A is real, if the microkernel prefers row output. If it prefers + // column output, we must induce a transposition and perform C += A*B + // where B (formerly A) is real. 
+ if ( col_pref ) + { + bli_obj_swap( a, b ); + + bli_obj_induce_trans( a ); + bli_obj_induce_trans( b ); + bli_obj_induce_trans( c ); + + return bli_gemm_md_ccr( a, b, beta, c, cntx_local, cntx ); + } + + // Create a local copy of the context and then prepare to use this + // context instead of the one passed in. + *cntx_local = **cntx; + *cntx = cntx_local; + + // Copy the real domain blocksizes into the slots of their complex + // counterparts. + blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx ); + blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx ); + blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx ); + blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx ); + blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx ); + + bli_blksz_copy_dt( BLIS_FLOAT, blksz_mr, BLIS_SCOMPLEX, blksz_mr ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr ); + + bli_blksz_copy_dt( BLIS_FLOAT, blksz_nr, BLIS_SCOMPLEX, blksz_nr ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr ); + + bli_blksz_copy_dt( BLIS_FLOAT, blksz_mc, BLIS_SCOMPLEX, blksz_mc ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc ); + + bli_blksz_copy_dt( BLIS_FLOAT, blksz_nc, BLIS_SCOMPLEX, blksz_nc ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc ); + + bli_blksz_copy_dt( BLIS_FLOAT, blksz_kc, BLIS_SCOMPLEX, blksz_kc ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc ); + + // Halve both the real and complex NR's (which are both real NR's). + bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, blksz_nr ); + bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, blksz_nr ); + bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_nr ); + bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_nr ); + + // Halve both the real and complex NC's (which are both real NC's). 
+ bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, blksz_nc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, blksz_nc ); + bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_nc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_nc ); + + // Set the pack schemas of objects A and B for normal execution. + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, a ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, b ); + + // static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) + func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx ); + + // Rather than check which complex datatype dt_comp refers to, we set + // the mixed-domain virtual microkernel for both types. + bli_func_set_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, l3_vir_ukrs ); + bli_func_set_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, l3_vir_ukrs ); + + // Return the computation and execution domains. + return doms; +} + +// ----------------------------------------------------------------------------- + +// cab: the suffix letters give the domains of C, A, B in that order; here C is real, and A and B are complex. +mddm_t bli_gemm_md_rcc + ( + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + cntx_t** cntx + ) +{ + mddm_t doms; + + // We assume that the requested computation domain is complex. + //dom_t dom_comp_in = bli_obj_comp_domain( c ); + //dom_t dom_comp_in = BLIS_COMPLEX; + + // For rcc, the computation (ukernel) will be real, and since the output + // matrix C is also real, so must be the execution domain. + doms.comp = BLIS_REAL; + doms.exec = BLIS_REAL; + + // Create a local copy of the context and then prepare to use this + // context instead of the one passed in. + *cntx_local = **cntx; + *cntx = cntx_local; + + // Copy the real domain blocksizes into the slots of their complex + // counterparts.
+ blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx ); + blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx ); + blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx ); + blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx ); + blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx ); + + bli_blksz_copy_dt( BLIS_FLOAT, blksz_mr, BLIS_SCOMPLEX, blksz_mr ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr ); + + bli_blksz_copy_dt( BLIS_FLOAT, blksz_nr, BLIS_SCOMPLEX, blksz_nr ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr ); + + bli_blksz_copy_dt( BLIS_FLOAT, blksz_mc, BLIS_SCOMPLEX, blksz_mc ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc ); + + bli_blksz_copy_dt( BLIS_FLOAT, blksz_nc, BLIS_SCOMPLEX, blksz_nc ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc ); + + bli_blksz_copy_dt( BLIS_FLOAT, blksz_kc, BLIS_SCOMPLEX, blksz_kc ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc ); + + // Halve both the real and complex KC's (which are both real KC's). + bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, blksz_kc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, blksz_kc ); + bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_kc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_kc ); + + // Use the 1r pack schema for both A and B with the conjugation + // of A or B toggled (to produce ar * br - ai * bi). + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS_1R, a ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS_1R, b ); + + bli_obj_toggle_conj( b ); + + // We also need to copy over the packm kernels from the 1m + // context. We query the address of that context here. 
+ const num_t dt_comp = bli_obj_dt( a ); + cntx_t* cntx_1m = bli_gks_query_ind_cntx( BLIS_1M, dt_comp ); + + func_t* cntx_funcs = bli_cntx_packm_kers_buf( *cntx ); + func_t* cntx_1m_funcs = bli_cntx_packm_kers_buf( cntx_1m ); + + for ( dim_t i = 0; i <= BLIS_PACKM_31XK_KER; ++i ) + { + cntx_funcs[ i ] = cntx_1m_funcs[ i ]; + } + + // Return the computation and execution domains. + return doms; +} + +// ----------------------------------------------------------------------------- + +// cab: the suffix letters give the domains of C, A, B in that order; here C is complex, and A and B are real. +mddm_t bli_gemm_md_crr + ( + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + cntx_t** cntx + ) +{ + mddm_t doms; +#ifndef BLIS_ENABLE_GEMM_MD_EXTRA_MEM + obj_t c_real; +#endif + + // We assume that the requested computation domain is real. + //dom_t dom_comp_in = bli_obj_comp_domain( c ); + //dom_t dom_comp_in = BLIS_REAL; + + // For crr, the computation (ukernel) will be real, and since we will + // be updating only the real part of the output matrix C, the execution + // domain is also real. + doms.comp = BLIS_REAL; + doms.exec = BLIS_REAL; + + // Since the A*B product is real, we can update only the real part of + // C. Thus, we convert the obj_t for the complex matrix to one that + // represents only the real part. HOWEVER, there are two situations in + // which we forgo this trick: + // - If extra memory optimizations are enabled, we should leave C alone + // since we'll be computing A*B to a temporary matrix and accumulating + // that result back to C, and in order for that to work, we need to + // allow that code to continue accessing C as a complex matrix. + // - Even if extra memory optimizations are disabled, logically projecting + // C as a real matrix can still cause problems if beta is non-unit. In + // that situation, the implementation won't get a chance to scale the + // imaginary components of C by beta, and thus it would compute the + // wrong answer. Thus, if beta is non-unit, we must leave C alone.
+#ifndef BLIS_ENABLE_GEMM_MD_EXTRA_MEM + if ( bli_obj_equals( beta, &BLIS_ONE ) ) + { + bli_obj_real_part( c, &c_real ); + + // Overwrite the complex obj_t with its real-only alias. + *c = c_real; + } +#endif + + // Set the pack schemas of objects A and B for normal execution. + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, a ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, b ); + + // Return the computation and execution domains. + return doms; +} + +// ----------------------------------------------------------------------------- + +// cab: the suffix letters give the domains of C, A, B in that order; here C is real, A is complex, and B is real. +mddm_t bli_gemm_md_rcr + ( + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + cntx_t** cntx + ) +{ + mddm_t doms; + obj_t a_real; + + // We assume that the requested computation domain is real. + //dom_t dom_comp_in = bli_obj_comp_domain( c ); + //dom_t dom_comp_in = BLIS_REAL; + + // For rcr, the computation (ukernel) will be real, and since the output + // matrix C is also real, so must be the execution domain. + doms.comp = BLIS_REAL; + doms.exec = BLIS_REAL; + + // Convert the obj_t for the complex matrix to one that represents only + // the real part. + bli_obj_real_part( a, &a_real ); + + // Overwrite the complex obj_t with its real-only alias. + *a = a_real; + + // Set the pack schemas of objects A and B for normal execution. + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, a ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, b ); + + // Return the computation and execution domains. + return doms; +} + +// ----------------------------------------------------------------------------- + +// cab: the suffix letters give the domains of C, A, B in that order; here C is real, A is real, and B is complex. +mddm_t bli_gemm_md_rrc + ( + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + cntx_t** cntx + ) +{ + mddm_t doms; + obj_t b_real; + + // We assume that the requested computation domain is real.
+ //dom_t dom_comp_in = bli_obj_comp_domain( c ); + //dom_t dom_comp_in = BLIS_REAL; + + // For rrc, the computation (ukernel) will be real, and since the output + // matrix C is also real, so must be the execution domain. + doms.comp = BLIS_REAL; + doms.exec = BLIS_REAL; + + // Convert the obj_t for the complex matrix to one that represents only + // the real part. + bli_obj_real_part( b, &b_real ); + + // Overwrite the complex obj_t with its real-only alias. + *b = b_real; + + // Set the pack schemas of objects A and B for normal execution. + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, a ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, b ); + + // Return the computation and execution domains. + return doms; +} + +// ----------------------------------------------------------------------------- + +// cab: the suffix letters give the domains of C, A, B in that order; here all three are real. +mddm_t bli_gemm_md_rrr + ( + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + cntx_t** cntx + ) +{ + mddm_t doms; + + // We assume that the requested computation domain is real. + //dom_t dom_comp_in = bli_obj_comp_domain( c ); + //dom_t dom_comp_in = BLIS_REAL; + + // For rrr, the computation (ukernel) and execution domains are both + // real. + doms.comp = BLIS_REAL; + doms.exec = BLIS_REAL; + + // Set the pack schemas of objects A and B for normal execution. + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, a ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, b ); + + // Return the computation and execution domains. + return doms; +} + +// ----------------------------------------------------------------------------- + +// cab: the suffix letters give the domains of C, A, B in that order; here all three are complex. +mddm_t bli_gemm_md_ccc + ( + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + cntx_t** cntx + ) +{ + mddm_t doms; + + // We assume that the requested computation domain is complex. + //dom_t dom_comp_in = bli_obj_comp_domain( c ); + //dom_t dom_comp_in = BLIS_COMPLEX; + + // For ccc, the computation (ukernel) and execution domains are both + // complex.
+ doms.comp = BLIS_COMPLEX; + doms.exec = BLIS_COMPLEX; + + // Set the pack schemas of objects A and B for normal execution. + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, a ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, b ); + + // Return the computation and execution domains. + return doms; +} + +// ----------------------------------------------------------------------------- + +void bli_gemm_md_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl + ) +{ + bli_init_once(); + + obj_t a_local; + obj_t b_local; + obj_t c_local; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_gemm_check( alpha, a, b, beta, c, cntx ); + + // If alpha is zero, scale by beta and return. + if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) + { + bli_scalm( beta, c ); + return; + } + + // Alias A, B, and C in case we need to apply transformations. + bli_obj_alias_to( a, &a_local ); + bli_obj_alias_to( b, &b_local ); + bli_obj_alias_to( c, &c_local ); + + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + { + bli_obj_swap( &a_local, &b_local ); + + bli_obj_induce_trans( &a_local ); + bli_obj_induce_trans( &b_local ); + bli_obj_induce_trans( &c_local ); + } + + cntx_t cntx_local; + + // Handle mixed domain cases in bli_gemm_md(), which may modify + // the objects or the context. (If the context is modified, cntx + // is adjusted to point to cntx_local.) + bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx ); + + // Record the threading for each level within the context. 
+ bli_rntm_set_ways_for_op + ( + BLIS_GEMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + rntm + ); + + // Invoke the internal back-end via the thread handler. + bli_l3_thread_decorator + ( + bli_gemm_int, + BLIS_GEMM, // operation family id + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + rntm, + cntl + ); +} + +// ----------------------------------------------------------------------------- + +// Reference path for mixed-datatype gemm: casts A, B, and C into temporary +// objects that share one datatype, runs gemm on those temporaries, and then +// casts the result back into the caller's C before freeing the temporaries. +void bli_gemm_md_zgemm + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl + ) +{ + bli_init_once(); + + obj_t a_local; + obj_t b_local; + obj_t c_local; + +#if 1 + obj_t am, bm, cm; + obj_t* c_orig; + + //if ( is_md == TRUE ) + { + //num_t dt_c2 = bli_obj_dt( c ); + //num_t dt_c1 = bli_dt_proj_to_complex( dt_c2 ); + //num_t dt_c = bli_dt_proj_to_double_prec( dt_c1 ); + //num_t dt_c = bli_obj_dt_proj_to_complex( c ); + num_t dt_c = BLIS_DCOMPLEX; + + if ( bli_obj_is_single_prec( c ) ) dt_c = BLIS_SCOMPLEX; + else dt_c = BLIS_DCOMPLEX; + + if ( bli_obj_is_real( a ) && + bli_obj_is_real( b ) && + bli_obj_is_real( c ) ) dt_c = bli_dt_proj_to_real( dt_c ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width_after_trans( a ); + + bli_obj_create( dt_c, m, k, 0, 0, &am ); + bli_obj_create( dt_c, k, n, 0, 0, &bm ); + bli_obj_create( dt_c, m, n, 0, 0, &cm ); + + //bli_projm( a, &am ); + //bli_projm( b, &bm ); + //bli_projm( c, &cm ); + bli_castm( a, &am ); + bli_castm( b, &bm ); + bli_castm( c, &cm ); + + c_orig = c; + + a = &am; + b = &bm; + c = &cm; + } +#endif + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_gemm_check( alpha, a, b, beta, c, cntx ); + + // If alpha is zero, scale by beta and return. When the temporary copies + // above are in use, c refers to cm at this point, so the scaled result + // must be cast back into the caller's C and the temporaries released + // before returning; otherwise the update is lost and am/bm/cm leak. + if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) + { + bli_scalm( beta, c ); +#if 1 + bli_castm( &cm, c_orig ); + + bli_obj_free( &am ); + bli_obj_free( &bm ); + bli_obj_free( &cm ); +#endif + return; + } + + // Alias A, B, and C in case we need to apply transformations.
+ bli_obj_alias_to( a, &a_local ); + bli_obj_alias_to( b, &b_local ); + bli_obj_alias_to( c, &c_local ); + + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + { + bli_obj_swap( &a_local, &b_local ); + + bli_obj_induce_trans( &a_local ); + bli_obj_induce_trans( &b_local ); + bli_obj_induce_trans( &c_local ); + } + + { + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } + } + + // Parse and interpret the contents of the rntm_t object to properly + // set the ways of parallelism for each loop, and then make any + // additional modifications necessary for the current operation. + bli_rntm_set_ways_for_op + ( + BLIS_GEMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + rntm + ); + + // Invoke the internal back-end via the thread handler.
+ bli_l3_thread_decorator + ( + bli_gemm_int, + BLIS_GEMM, // operation family id + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + rntm, + cntl + ); + +#if 1 + //if ( is_md == TRUE ) + { + //bli_projm( &cm, c_orig ); + bli_castm( &cm, c_orig ); + + bli_obj_free( &am ); + bli_obj_free( &bm ); + bli_obj_free( &cm ); + } +#endif +} + +#endif diff --git a/frame/3/gemm/bli_gemm_md.h b/frame/3/gemm/bli_gemm_md.h new file mode 100644 index 000000000..ec88ebff1 --- /dev/null +++ b/frame/3/gemm/bli_gemm_md.h @@ -0,0 +1,327 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_gemm_md_c2r_ref.h" + +// Define a local struct type that makes returning two values easier. +typedef struct mddm_s +{ + dom_t comp; + dom_t exec; +} mddm_t; + +void bli_gemm_md + ( + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + cntx_t** cntx + ); +mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); +mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); +mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); +mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); +mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); +mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); +mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); +mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); + +// ----------------------------------------------------------------------------- + +void bli_gemm_md_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl + ); + +void bli_gemm_md_zgemm + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + 
cntl_t* cntl + ); + +// ----------------------------------------------------------------------------- + +static bool_t bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) +{ + bool_t r_val = FALSE; + + // NOTE: The last conditional subexpression is necessary if/when we + // allow the user to specify the computation domain. (The computation + // domain is currently ignored, but once it is honored as a user- + // settable value, it will affect the execution domain, which is what + // is checked below. Until then, the last expression is not actually + // necessary since crr is already unconditionally associated with an + // execution domain of BLIS_REAL.) + if ( bli_obj_is_complex( c ) && + bli_obj_is_real( a ) && + bli_obj_is_real( b ) && + bli_obj_exec_domain( c ) == BLIS_REAL ) + r_val = TRUE; + + return r_val; +} + +static bool_t bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) +{ + bool_t r_val = FALSE; + + // NOTE: The last conditional subexpression is necessary if/when we + // allow the user to specify the computation domain. (The computation + // domain is currently ignored, but once it is honored as a user- + // settable value, it will affect the execution domain, which is what + // is checked below. Until then, the last expression is not actually + // necessary since ccr is already unconditionally associated with an + // execution domain of BLIS_COMPLEX.) + if ( bli_obj_is_complex( c ) && + bli_obj_is_complex( a ) && + bli_obj_is_real( b ) && + bli_obj_exec_domain( c ) == BLIS_COMPLEX ) + r_val = TRUE; + + return r_val; +} + +static bool_t bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) +{ + bool_t r_val = FALSE; + + // NOTE: The last conditional subexpression is necessary if/when we + // allow the user to specify the computation domain. (The computation + // domain is currently ignored, but once it is honored as a user- + // settable value, it will affect the execution domain, which is what + // is checked below. 
Until then, the last expression is not actually + // necessary since crc is already unconditionally associated with an + // execution domain of BLIS_COMPLEX.) + if ( bli_obj_is_complex( c ) && + bli_obj_is_real( a ) && + bli_obj_is_complex( b ) && + bli_obj_exec_domain( c ) == BLIS_COMPLEX ) + r_val = TRUE; + + return r_val; +} + +// ----------------------------------------------------------------------------- + +static void bli_gemm_md_ker_var2_recast + ( + num_t* dt_comp, + num_t dt_a, + num_t dt_b, + num_t dt_c, + dim_t* m, + dim_t* n, + dim_t* k, + inc_t* pd_a, inc_t* ps_a, + inc_t* pd_b, inc_t* ps_b, + obj_t* c, + inc_t* rs_c, inc_t* cs_c + ) +{ + if ( bli_is_real( dt_c ) && + bli_is_complex( dt_a ) && + bli_is_complex( dt_b ) ) + { + // The rcc case is executed with a real macrokernel, so we need to + // double the k dimension (because both A and B are packed to the 1r + // schema), and also the panel strides of A and B since they were + // packed as complex matrices and we now need to convert them to + // units of real elements. + *k *= 2; + *ps_a *= 2; + *ps_b *= 2; + } + else if ( bli_is_complex( dt_c ) && + bli_is_real( dt_a ) && + bli_is_complex( dt_b ) ) + { +#if 1 + obj_t beta; + + bli_obj_scalar_detach( c, &beta ); + + if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && + bli_obj_imag_is_zero( &beta ) && + bli_is_row_stored( *rs_c, *cs_c ) && + bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) + { + // If beta is real, and C is row-stored, and the computation + // precision is equal to the storage precision of C, we can use the + // real macrokernel (and real microkernel, which is already stored + // to the real virtual microkernel slots of the context) instead of + // the complex macrokernel and c2r virtual microkernel.
+ *dt_comp = bli_dt_proj_to_real( *dt_comp ); + *n *= 2; + *pd_b *= 2; *ps_b *= 2; + *rs_c *= 2; + } + else +#endif + { + // Generally speaking, the crc case is executed with a complex + // macrokernel, so we need to halve the panel stride of A (which + // is real) since the macrokernel will perform the pointer + // arithmetic in units of complex elements. + *ps_a /= 2; + } + } + else if ( bli_is_complex( dt_c ) && + bli_is_complex( dt_a ) && + bli_is_real( dt_b ) ) + { +#if 1 + obj_t beta; + + bli_obj_scalar_detach( c, &beta ); + + if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && + bli_obj_imag_is_zero( &beta ) && + bli_is_col_stored( *rs_c, *cs_c ) && + bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) + { + // If beta is real, and C is column-stored, and the computation + // precision is equal to the storage precision of C, we can use the + // real macrokernel (and real microkernel, which is already stored + // to the real virtual microkernel slots of the context) instead of + // the complex macrokernel and c2r virtual microkernel. + *dt_comp = bli_dt_proj_to_real( *dt_comp ); + *m *= 2; + *pd_a *= 2; *ps_a *= 2; + *cs_c *= 2; + } + else +#endif + { + // Generally speaking, the ccr case is executed with a complex + // macrokernel, so we need to halve the panel stride of B (which + // is real) since the macrokernel will perform the pointer + // arithmetic in units of complex elements. + *ps_b /= 2; + } + } +#if 0 + else if ( bli_is_real( dt_c ) && + bli_is_real( dt_a ) && + bli_is_real( dt_b ) ) + { + // No action needed. +//printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k ); + } + else if ( bli_is_complex( dt_c ) && + bli_is_real( dt_a ) && + bli_is_real( dt_b ) ) + { + // No action needed. + } + else if ( bli_is_real( dt_c ) && + bli_is_complex( dt_a ) && + bli_is_real( dt_b ) ) + { + // No action needed. + } + else if ( bli_is_real( dt_c ) && + bli_is_real( dt_a ) && + bli_is_complex( dt_b ) ) + { + // No action needed.
+ } +#endif +} + +// ----------------------------------------------------------------------------- + +// +// Prototype object-based interfaces. +// + +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTEMAC0(opname) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + cntl_t* cntl, \ + thrinfo_t* thread \ + ); + +GENPROT( gemm_ker_var2_md ) + +// +// Prototype BLAS-like interfaces with void pointer operands. +// + +#undef GENTPROT2 +#define GENTPROT2( ctype_c, ctype_e, chc, che, varname ) \ +\ +void PASTEMAC2(chc,che,varname) \ + ( \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ); + +INSERT_GENTPROT2_BASIC0( gemm_ker_var2_md ) +INSERT_GENTPROT2_MIXDP0( gemm_ker_var2_md ) + diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.c b/frame/3/gemm/bli_gemm_md_c2r_ref.c new file mode 100644 index 000000000..f1479b5b1 --- /dev/null +++ b/frame/3/gemm/bli_gemm_md_c2r_ref.c @@ -0,0 +1,223 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_GEMM_MD + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, suf ) \ +\ +void PASTEMAC2(ch,opname,suf) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ +\ + PASTECH(chr,gemm_ukr_ft) \ + rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t row_pref = !col_pref; \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype_r ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + inc_t rs_ct; \ + 
inc_t cs_ct; \ +\ + ctype_r* restrict a_r = ( ctype_r* )a; \ +\ + ctype_r* restrict b_r = ( ctype_r* )b; \ +\ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ +\ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ +/* + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +*/ \ +\ + ctype_r* restrict beta_r = &PASTEMAC(ch,real)( *beta ); \ + ctype_r* restrict beta_i = &PASTEMAC(ch,imag)( *beta ); \ +\ + ctype_r* c_use; \ + inc_t rs_c_use; \ + inc_t cs_c_use; \ +\ + bool_t using_ct; \ +\ +/* +PASTEMAC(d,fprintm)( stdout, "gemm_ukr: a", 2*mr, k, \ + a_r, 1, 6, "%5.2f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "gemm_ukr: b", k, nr, \ + b_r, 8, 1, "%5.2f", "" ); \ +*/ \ +\ + /* SAFETY CHECK: The higher level implementation should never + allow an alpha with non-zero imaginary component to be passed + in, because it can't be applied properly using the 1m method. + If alpha is not real, then something is very wrong. */ \ +/* + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +*/ \ +\ + /* If beta has a non-zero imaginary component OR if c is stored with + general stride, then we compute the alpha*a*b product into temporary + storage and then accumulate that result into c afterwards. Note that + the other two cases concerning disagreement between the storage of C + and the output preference of the micro-kernel, should ONLY occur in + the context of trsm, whereby this virtual micro-kernel is called + directly from the trsm macro-kernel to update the micro-tile b11 + that exists within the packed row-panel of B. Indeed that is the + reason those cases MUST be explicitly handled. 
*/ \ + if ( !PASTEMAC(chr,eq0)( *beta_i ) ) using_ct = TRUE; \ + else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \ + else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ + else using_ct = FALSE; \ +\ +\ + if ( using_ct ) \ + { \ + /* In the atypical cases, we compute the result into temporary + workspace ct and then accumulate it back to c at the end. */ \ +\ + /* Set the strides of ct based on the preference of the underlying + native real domain gemm micro-kernel. Note that we set the ct + strides in units of complex elements. */ \ + if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \ + else { rs_ct = nr; cs_ct = 1; } \ +\ + c_use = ( ctype_r* )ct; \ + rs_c_use = rs_ct; \ + cs_c_use = cs_ct; \ +\ + /* Convert the strides from being in units of complex elements to + be in units of real elements. Note that we don't need to check for + general storage here because ct was just given either column or row + strides by the col_pref test above. */ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ + else rs_c_use *= 2; \ +\ + /* ct = alpha_r * a * b; (beta is passed as zero and applied below) */ \ + rgemm_ukr \ + ( \ + k, \ + alpha_r, \ + a_r, \ + b_r, \ + zero_r, \ + c_use, rs_c_use, cs_c_use, \ + data, \ + cntx \ + ); \ +\ + dim_t i, j; \ +\ + /* Accumulate the final result in ct back to c.
*/ \ + if ( PASTEMAC(ch,eq1)( *beta ) ) \ + { \ + for ( j = 0; j < nr; ++j ) \ + for ( i = 0; i < mr; ++i ) \ + { \ + PASTEMAC(ch,adds)( *(ct + i*rs_ct + j*cs_ct), \ + *(c + i*rs_c + j*cs_c ) ); \ + } \ + } \ + else if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + for ( j = 0; j < nr; ++j ) \ + for ( i = 0; i < mr; ++i ) \ + { \ + PASTEMAC(ch,copys)( *(ct + i*rs_ct + j*cs_ct), \ + *(c + i*rs_c + j*cs_c ) ); \ + } \ + } \ + else /*if ( !PASTEMAC(ch,eq1)( *beta ) )*/ \ + { \ + for ( j = 0; j < nr; ++j ) \ + for ( i = 0; i < mr; ++i ) \ + { \ + PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \ + *beta, \ + *(c + i*rs_c + j*cs_c ) ); \ + } \ + } \ + } \ + else \ + { \ + /* In the typical cases, we use the real part of beta and + accumulate directly into the output matrix c. */ \ +\ + c_use = ( ctype_r* )c; \ + rs_c_use = rs_c; \ + cs_c_use = cs_c; \ +\ + /* Convert the strides from being in units of complex elements to + be in units of real elements. Note that we don't need to check for + general storage here because that case corresponds to the scenario + where we are using the ct buffer and its rs_ct/cs_ct strides. */ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ + else rs_c_use *= 2; \ +\ + /* c = beta * c + alpha_r * a * b; */ \ + rgemm_ukr \ + ( \ + k, \ + alpha_r, \ + a_r, \ + b_r, \ + beta_r, \ + c_use, rs_c_use, cs_c_use, \ + data, \ + cntx \ + ); \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( gemm_md_c2r, BLIS_REF_SUFFIX ) + +#endif diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.h b/frame/3/gemm/bli_gemm_md_c2r_ref.h new file mode 100644 index 000000000..fa5893e2f --- /dev/null +++ b/frame/3/gemm/bli_gemm_md_c2r_ref.h @@ -0,0 +1,41 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// -- Level-3 native micro-kernel prototype redefinitions ---------------------- + +#undef gemm_ukr_name +#define gemm_ukr_name gemm_md_c2r_ref + +// Include the native micro-kernel API template. 
+#include "bli_l3_ukr.h" diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index 93c014051..f45542d37 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -278,6 +278,9 @@ void PASTEMAC(ch,varname) \ /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ b1 = b_cast; \ c1 = c_cast; \ diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index 5875c3317..3061a5c39 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -278,6 +278,9 @@ void PASTEMAC(ch,varname) \ /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ b1 = b_cast; \ c1 = c_cast; \ diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index ff64501aa..eef104eed 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -316,6 +316,9 @@ void PASTEMAC(ch,varname) \ \ /* Save the imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ b1 = b_cast; \ c1 = c_cast; \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index bfe57ba16..23dd22cb8 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -323,6 +323,9 @@ void PASTEMAC(ch,varname) \ \ /* Save the imaginary stride of B to the auxinfo_t object. 
*/ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ b1 = b_cast; \ c1 = c_cast; \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index e2eef964e..ae44e8ff9 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -323,6 +323,9 @@ void PASTEMAC(ch,varname) \ \ /* Save the imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ b1 = b_cast; \ c1 = c_cast; \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index c76bc535f..9d7ec4cfe 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -324,6 +324,9 @@ void PASTEMAC(ch,varname) \ \ /* Save the imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ b1 = b_cast; \ c1 = c_cast; \ diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index 34fc6a2b6..021f8baf2 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -338,6 +338,9 @@ void PASTEMAC(ch,varname) \ \ /* Save the imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). 
*/ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ b1 = b_cast; \ c1 = c_cast; \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 78e2a7a15..0ddcd16d4 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -346,6 +346,9 @@ void PASTEMAC(ch,varname) \ \ /* Save the imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ b1 = b_cast; \ c1 = c_cast; \ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 8045fe09d..1cf456678 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -368,6 +368,9 @@ void PASTEMAC(ch,varname) \ NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ bli_auxinfo_set_is_b( istep_a, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ b1 = b_cast; \ c1 = c_cast; \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index e1279813c..b5a76d03a 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -363,6 +363,9 @@ void PASTEMAC(ch,varname) \ NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ bli_auxinfo_set_is_b( istep_a, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). 
*/ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ b1 = b_cast; \ c1 = c_cast; \ diff --git a/frame/base/bli_auxinfo.h b/frame/base/bli_auxinfo.h index 09595968e..cf31ffa0f 100644 --- a/frame/base/bli_auxinfo.h +++ b/frame/base/bli_auxinfo.h @@ -65,10 +65,12 @@ static inc_t bli_auxinfo_is_b( auxinfo_t* ai ) return ai->is_b; } +#if 0 static inc_t bli_auxinfo_dt_on_output( auxinfo_t* ai ) { return ai->dt_on_output; } +#endif // auxinfo_t field modification @@ -105,10 +107,12 @@ static void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai ) ai->is_b = is; } +#if 0 static void bli_auxinfo_set_dt_on_output( num_t dt_on_output, auxinfo_t* ai ) { ai->dt_on_output = dt_on_output; } +#endif #endif diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 3cfc6c39c..f8ed2663f 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -393,7 +393,9 @@ static bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cnt static bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { - const num_t dt = bli_obj_dt( obj ); + // Note that we use the computation datatype, which may differ from the + // storage datatype of C (when performing a mixed datatype operation). + const num_t dt = bli_obj_comp_dt( obj ); const bool_t ukr_prefers_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool_t ukr_prefers_cols @@ -442,9 +444,9 @@ static bool_t bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cnt static bool_t bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { - // Note that we use the execution datatype, which may differ from the - // storage datatype of C (though this would happen in very few situations). - const num_t dt = bli_obj_exec_dt( obj ); + // Note that we use the computation datatype, which may differ from the + // storage datatype of C (when performing a mixed datatype operation). 
+ const num_t dt = bli_obj_comp_dt( obj ); const bool_t ukr_prefers_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool_t ukr_prefers_cols diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index 407f31cb5..722b5b93a 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -108,6 +108,7 @@ void bli_obj_create_without_buffer bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); + bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_diag_offset( 0, obj ); @@ -115,8 +116,14 @@ void bli_obj_create_without_buffer // Set the internal scalar to 1.0. s = bli_obj_internal_scalar_buffer( obj ); - if ( bli_is_float( dt ) ) { bli_sset1s( *(( float* )s) ); } - else if ( bli_is_double( dt ) ) { bli_dset1s( *(( double* )s) ); } + // Always writing the imaginary component is needed in mixed-domain + // scenarios. Failing to do this can lead to reading uninitialized + // memory just before calling the macrokernel (as the internal scalars + // for A and B are merged). 
+ //if ( bli_is_float( dt ) ) { bli_sset1s( *(( float* )s) ); } + //else if ( bli_is_double( dt ) ) { bli_dset1s( *(( double* )s) ); } + if ( bli_is_float( dt ) ) { bli_cset1s( *(( scomplex* )s) ); } + else if ( bli_is_double( dt ) ) { bli_zset1s( *(( dcomplex* )s) ); } else if ( bli_is_scomplex( dt ) ) { bli_cset1s( *(( scomplex* )s) ); } else if ( bli_is_dcomplex( dt ) ) { bli_zset1s( *(( dcomplex* )s) ); } } diff --git a/frame/base/bli_query.c b/frame/base/bli_query.c index 73446830f..b4091a38f 100644 --- a/frame/base/bli_query.c +++ b/frame/base/bli_query.c @@ -86,6 +86,7 @@ bool_t bli_obj_equals( obj_t* a, bool_t bli_obj_imag_equals( obj_t* a, obj_t* b ) { +#if 0 bool_t r_val = FALSE; num_t dt_a; num_t dt_b; @@ -128,7 +129,51 @@ bool_t bli_obj_imag_equals( obj_t* a, r_val = bli_deq( bli_zimag( *ap_z ), *bp_z ); } } +#endif + bool_t r_val = FALSE; + + // The function is not yet implemented for vectors and matrices. + if ( !bli_obj_is_1x1( a ) || + !bli_obj_is_1x1( b ) || + bli_obj_is_complex( b ) ) + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + + double a_r, a_i; + double b_r, b_i; + + // Get the real and imaginary parts of a and cast them to local doubles. + bli_getsc( a, &a_r, &a_i ); + + // Get the value of b and cast to a local double. (Note: the imaginary part + // of b is ignored since we know b is real.) + bli_getsc( b, &b_r, &b_i ); + + // Compare the imaginary part of a to the real part of b. + if ( a_i == b_r ) r_val = TRUE; return r_val; } +bool_t bli_obj_imag_is_zero( obj_t* a ) +{ + bool_t r_val = TRUE; + + // The function is not yet implemented for vectors and matrices. + if ( !bli_obj_is_1x1( a ) ) + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + + if ( bli_obj_is_complex( a ) ) + { + double a_r, a_i; + + // Get the real and imaginary parts and cast them to local doubles. + bli_getsc( a, &a_r, &a_i ); + + // Compare the imaginary part of a to double-precision zero. 
+ if ( !bli_deq0( a_i ) ) r_val = FALSE; + } + + return r_val; +} + + diff --git a/frame/base/bli_query.h b/frame/base/bli_query.h index 827752103..afc01cd5a 100644 --- a/frame/base/bli_query.h +++ b/frame/base/bli_query.h @@ -37,3 +37,5 @@ bool_t bli_obj_equals( obj_t* a, bool_t bli_obj_imag_equals( obj_t* a, obj_t* b ); + +bool_t bli_obj_imag_is_zero( obj_t* a ); diff --git a/frame/base/cast/bli_castnzm.c b/frame/base/cast/bli_castnzm.c new file mode 100644 index 000000000..548e6410f --- /dev/null +++ b/frame/base/cast/bli_castnzm.c @@ -0,0 +1,267 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// NOTE: This is one of the few functions in BLIS that is defined +// with heterogeneous type support. This is done so that we have +// an operation that can be used to typecast (copy-cast) a matrix +// of one datatype to a matrix of another datatype. + +typedef void (*FUNCPTR_T) + ( + trans_t transa, + dim_t m, + dim_t n, + void* restrict a, inc_t rs_a, inc_t cs_a, + void* restrict b, inc_t rs_b, inc_t cs_b + ); + +static FUNCPTR_T GENARRAY2_ALL(ftypes,castnzm); + +// +// Define object-based interface. +// + +void bli_castnzm + ( + obj_t* a, + obj_t* b + ) +{ + num_t dt_a = bli_obj_dt( a ); + num_t dt_b = bli_obj_dt( b ); + + trans_t transa = bli_obj_conjtrans_status( a ); + + dim_t m = bli_obj_length( b ); + dim_t n = bli_obj_width( b ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t rs_a = bli_obj_row_stride( a ); + inc_t cs_a = bli_obj_col_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t cs_b = bli_obj_col_stride( b ); + + FUNCPTR_T f; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_castnzm_check( a, b ); + +#if 0 + if ( bli_obj_dt( a ) == bli_obj_dt( b ) ) + { + // If a and b share the same datatype, we can simply use copym. + bli_copym( a, b ); + return; + } +#endif + + // Index into the type combination array to extract the correct + // function pointer.
+ f = ftypes[dt_a][dt_b]; + + // Invoke the void pointer-based function. + f + ( + transa, + m, + n, + buf_a, rs_a, cs_a, + buf_b, rs_b, cs_b + ); +} + +// ----------------------------------------------------------------------------- + +// +// Define BLAS-like interfaces with typed operands. +// + +#undef GENTFUNC2 +#define GENTFUNC2( ctype_a, ctype_b, cha, chb, opname ) \ +\ +void PASTEMAC2(cha,chb,opname) \ + ( \ + trans_t transa, \ + dim_t m, \ + dim_t n, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict b, inc_t rs_b, inc_t cs_b \ + ) \ +{ \ + ctype_a* restrict a_cast = a; \ + ctype_b* restrict b_cast = b; \ + conj_t conja; \ + dim_t n_iter; \ + dim_t n_elem; \ + inc_t lda, inca; \ + inc_t ldb, incb; \ + dim_t j, i; \ +\ + /* Set various loop parameters. */ \ + bli_set_dims_incs_2m \ + ( \ + transa, \ + m, n, rs_a, cs_a, rs_b, cs_b, \ + &n_elem, &n_iter, &inca, &lda, &incb, &ldb \ + ); \ +\ + /* Extract the conjugation component from the transa parameter. */ \ + conja = bli_extract_conj( transa ); \ +\ + if ( bli_is_conj( conja ) ) \ + { \ + if ( inca == 1 && incb == 1 ) \ + { \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ +\ + for ( i = 0; i < n_elem; ++i ) \ + { \ + PASTEMAC2(cha,chb,copyjnzs)( a1[i], b1[i] ); \ + } \ + } \ + } \ + else \ + { \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ +\ + for ( i = 0; i < n_elem; ++i ) \ + { \ + PASTEMAC2(cha,chb,copyjnzs)( *a1, *b1 ); \ +\ + a1 += inca; \ + b1 += incb; \ + } \ + } \ + } \ + } \ + else \ + { \ + if ( inca == 1 && incb == 1 ) \ + { \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ +\ + for ( i = 0; i < n_elem; ++i ) \ + { \ + PASTEMAC2(cha,chb,copynzs)( a1[i], 
b1[i] ); \ + } \ + } \ + } \ + else \ + { \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ +\ + for ( i = 0; i < n_elem; ++i ) \ + { \ + PASTEMAC2(cha,chb,copynzs)( *a1, *b1 ); \ +\ + a1 += inca; \ + b1 += incb; \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC2_BASIC0( castnzm ) +INSERT_GENTFUNC2_MIXDP0( castnzm ) + +// ----------------------------------------------------------------------------- + +// +// Define object-based _check() function. +// + +void bli_castnzm_check + ( + obj_t* a, + obj_t* b + ) +{ + err_t e_val; + + // Check object datatypes. + + e_val = bli_check_floating_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( b ); + bli_check_error_code( e_val ); + + // Check structure. + // NOTE: We enforce general structure for now in order to simplify the + // implementation. + + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); + + // Check object dimensions. + + e_val = bli_check_matrix_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_matrix_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_conformal_dims( a, b ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). + + e_val = bli_check_object_buffer( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( b ); + bli_check_error_code( e_val ); +} + diff --git a/frame/base/cast/bli_castnzm.h b/frame/base/cast/bli_castnzm.h new file mode 100644 index 000000000..7770515b8 --- /dev/null +++ b/frame/base/cast/bli_castnzm.h @@ -0,0 +1,73 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries.
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// +// Prototype object-based interface. +// + +void bli_castnzm + ( + obj_t* a, + obj_t* b + ); + +// +// Prototype BLAS-like interfaces with heterogeneous-typed operands. 
+// + +#undef GENTPROT2 +#define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ +\ +void PASTEMAC2(cha,chb,opname) \ + ( \ + trans_t transa, \ + dim_t m, \ + dim_t n, \ + void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b \ + ); + +INSERT_GENTPROT2_BASIC0( castnzm ) +INSERT_GENTPROT2_MIXDP0( castnzm ) + +// +// Prototype object-based _check() function. +// + +void bli_castnzm_check + ( + obj_t* a, + obj_t* b + ); + diff --git a/frame/compat/bla_scal.c b/frame/compat/bla_scal.c index 4c867b719..9a0a905e4 100644 --- a/frame/compat/bla_scal.c +++ b/frame/compat/bla_scal.c @@ -67,7 +67,7 @@ void PASTEF772(chx,cha,blasname) \ that is, we just always sub-optimally implement those cases by casting alpha to ctype_x (potentially the complex domain) and using the homogeneous datatype instance according to that type. */ \ - PASTEMAC2(cha,chx,cast)( (ftype_a*)alpha, alpha_cast ); \ + PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \ \ /* Call BLIS interface. */ \ PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h index 8c1f2efc9..efda67931 100644 --- a/frame/include/bli_config_macro_defs.h +++ b/frame/include/bli_config_macro_defs.h @@ -99,6 +99,25 @@ #endif +// -- MIXED DATATYPE SUPPORT --------------------------------------------------- + +// Enable mixed datatype support? +#ifdef BLIS_DISABLE_MIXED_DT + #undef BLIS_ENABLE_GEMM_MD +#else + // Default behavior is enabled. + #define BLIS_ENABLE_GEMM_MD +#endif + +// Enable memory-intensive optimizations for mixed datatype support? +#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM + #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM +#else + // Default behavior is enabled. + #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM +#endif + + // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. 
That is, do not enforce diff --git a/frame/include/bli_genarray_macro_defs.h b/frame/include/bli_genarray_macro_defs.h index e288dbac2..556fa7542 100644 --- a/frame/include/bli_genarray_macro_defs.h +++ b/frame/include/bli_genarray_macro_defs.h @@ -65,6 +65,18 @@ static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \ // -- "Smart" two-operand macro -- +#define GENARRAY_FPA2(tname,op) \ +\ +static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ +{ \ + { ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \ + { ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \ + { ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \ + { ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) } \ +} + +// -- "Smart" two-operand macro -- + /* #define GENARRAY2_VFP(arrayname,op) \ \ diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h index 22b8d6302..57e5e2b3e 100644 --- a/frame/include/bli_gentfunc_macro_defs.h +++ b/frame/include/bli_gentfunc_macro_defs.h @@ -529,6 +529,52 @@ GENTFUNC2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) +// -- Mixed domain/precision (all) two-operand macro with real projection of first operand -- + +// -- (no auxiliary arguments) -- + +#define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \ +\ +GENTFUNC2( float, double, s, d, tfuncname ) \ +GENTFUNC2( float, scomplex, s, c, tfuncname ) \ +GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ +\ +GENTFUNC2( double, float, d, s, tfuncname ) \ +GENTFUNC2( double, scomplex, d, c, tfuncname ) \ +GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ +\ +GENTFUNC2( scomplex, float, c, s, tfuncname ) \ +GENTFUNC2( scomplex, double, c, d, tfuncname ) \ +GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ +\ 
+GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ +GENTFUNC2( dcomplex, double, z, d, tfuncname ) \ +GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) + + +// -- (one auxiliary argument) -- + +#define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \ +\ +GENTFUNC2( float, double, s, d, tfuncname, varname ) \ +GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ +GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ +\ +GENTFUNC2( double, float, d, s, tfuncname, varname ) \ +GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ +GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ +\ +GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ +GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ +GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ +\ +GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ +GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \ +GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) + + + + // -- Macros for functions with three primary operands ------------------------- diff --git a/frame/include/bli_misc_macro_defs.h b/frame/include/bli_misc_macro_defs.h index 71a3fa2b1..f0f9761f5 100644 --- a/frame/include/bli_misc_macro_defs.h +++ b/frame/include/bli_misc_macro_defs.h @@ -136,6 +136,20 @@ static void bli_toggle_bool( bool_t* b ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) +// return C type for char + +#define bli_sctype float +#define bli_dctype double +#define bli_cctype scomplex +#define bli_zctype dcomplex + +// return real proj of C type for char + +#define bli_sctyper float +#define bli_dctyper double +#define bli_cctyper float +#define bli_zctyper double + // return default format specifier for char diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 77fd524dd..acc32ccbb 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -122,13 +122,15 @@ static num_t bli_obj_dt_proj_to_double_prec( 
obj_t* obj ) static bool_t bli_obj_is_real( obj_t* obj ) { return ( bool_t ) - ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL ); + ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && + !bli_obj_is_const( obj ) ); } static bool_t bli_obj_is_complex( obj_t* obj ) { return ( bool_t ) - ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX ); + ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && + !bli_obj_is_const( obj ) ); } static num_t bli_obj_dt_proj_to_real( obj_t* obj ) @@ -179,6 +181,24 @@ static prec_t bli_obj_exec_prec( obj_t* obj ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } +static num_t bli_obj_comp_dt( obj_t* obj ) +{ + return ( num_t ) + ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); +} + +static dom_t bli_obj_comp_domain( obj_t* obj ) +{ + return ( dom_t ) + ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); +} + +static prec_t bli_obj_comp_prec( obj_t* obj ) +{ + return ( prec_t ) + ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); +} + static trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) @@ -454,6 +474,24 @@ static void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ); } +static void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) +{ + obj->info = ( objbits_t ) + ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ); +} + +static void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) +{ + obj->info = ( objbits_t ) + ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ); +} + +static void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) +{ + obj->info = ( objbits_t ) + ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ); +} + static void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) { obj->info = ( objbits_t ) @@ -1183,9 +1221,11 @@ static void bli_obj_real_part( obj_t* c, obj_t* r ) const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = 
bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); + const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); + bli_obj_set_comp_dt( dt_comp_r, r ); // Update the element size. siz_t es_c = bli_obj_elem_size( c ); @@ -1212,9 +1252,11 @@ static void bli_obj_imag_part( obj_t* c, obj_t* i ) const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); + const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); + bli_obj_set_comp_dt( dt_comp_r, i ); // Update the element size. siz_t es_c = bli_obj_elem_size( c ); @@ -1251,13 +1293,24 @@ static void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, v } } -// Swap object contents. +// Swap all object fields (metadata/properties). static void bli_obj_swap( obj_t* a, obj_t* b ) { obj_t t = *b; *b = *a; *a = t; } +// Swap object pack schemas. + +static void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) +{ + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + bli_obj_set_pack_schema( schema_b, a ); + bli_obj_set_pack_schema( schema_a, b ); +} + // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. 
diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h index 0c96e7f84..6828bdfe9 100644 --- a/frame/include/bli_scalar_macro_defs.h +++ b/frame/include/bli_scalar_macro_defs.h @@ -140,14 +140,15 @@ #include "bli_axmys.h" -#include "bli_cast.h" - #include "bli_conjs.h" #include "bli_copys.h" #include "bli_copyjs.h" #include "bli_copycjs.h" +#include "bli_copynzs.h" +#include "bli_copyjnzs.h" + #include "bli_dots.h" #include "bli_dotjs.h" @@ -191,8 +192,8 @@ // Inlined scalar macros in loops #include "bli_adds_mxn.h" #include "bli_adds_mxn_uplo.h" -#include "bli_copys_mxn.h" #include "bli_set0s_mxn.h" +#include "bli_copys_mxn.h" #include "bli_xpbys_mxn.h" #include "bli_xpbys_mxn_uplo.h" diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index ca1dfa41e..1e5b14a26 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -257,6 +257,10 @@ typedef dcomplex f77_dcomplex; - 1 == Hermitian - 2 == symmetric - 3 == triangular + 31 ~ 29 Computation numerical datatype + - 29: domain (0 == real, 1 == complex) + - 30: precision (0 == single, 1 == double) + - 31: used to encode integer, constant types */ #define BLIS_DATATYPE_SHIFT 0 @@ -286,6 +290,9 @@ typedef dcomplex f77_dcomplex; #define BLIS_PACK_REV_IF_LOWER_SHIFT 24 #define BLIS_PACK_BUFFER_SHIFT 25 #define BLIS_STRUC_SHIFT 27 +#define BLIS_COMP_DT_SHIFT 29 +#define BLIS_COMP_DOMAIN_SHIFT 29 +#define BLIS_COMP_PREC_SHIFT 30 // // -- BLIS info bit field masks ------------------------------------------------ @@ -318,6 +325,9 @@ typedef dcomplex f77_dcomplex; #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) +#define BLIS_COMP_DT_BITS ( 0x7u << BLIS_COMP_DT_SHIFT ) /* unsigned: 0x7 << 29 does not fit in a signed int */ +#define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) +#define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // @@ -603,13
+613,15 @@ typedef enum typedef enum { - BLIS_3MH = 0, + BLIS_3MH = 0, BLIS_3M1, BLIS_4MH, BLIS_4M1B, BLIS_4M1A, BLIS_1M, - BLIS_NAT + BLIS_NAT, + BLIS_IND_FIRST = 0, + BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) @@ -1003,7 +1015,7 @@ typedef struct inc_t is_b; // The type to convert to on output. - num_t dt_on_output; + //num_t dt_on_output; } auxinfo_t; diff --git a/frame/include/bli_x86_asm_macros.h b/frame/include/bli_x86_asm_macros.h index 0522410be..fcb1f4324 100644 --- a/frame/include/bli_x86_asm_macros.h +++ b/frame/include/bli_x86_asm_macros.h @@ -671,12 +671,68 @@ #define MOVD(_0, _1) INSTR_(movd, _0, _1) #define MOVL(_0, _1) INSTR_(movl, _0, _1) #define MOVQ(_0, _1) INSTR_(movq, _0, _1) +#define CMOVA(_0, _1) INSTR_(cmova, _0, _1) +#define CMOVAE(_0, _1) INSTR_(cmovae, _0, _1) +#define CMOVB(_0, _1) INSTR_(cmovb, _0, _1) +#define CMOVBE(_0, _1) INSTR_(cmovbe, _0, _1) +#define CMOVC(_0, _1) INSTR_(cmovc, _0, _1) +#define CMOVP(_0, _1) INSTR_(cmovp, _0, _1) +#define CMOVO(_0, _1) INSTR_(cmovo, _0, _1) +#define CMOVS(_0, _1) INSTR_(cmovs, _0, _1) +#define CMOVE(_0, _1) INSTR_(cmove, _0, _1) +#define CMOVZ(_0, _1) INSTR_(cmovz, _0, _1) +#define CMOVG(_0, _1) INSTR_(cmovg, _0, _1) +#define CMOVGE(_0, _1) INSTR_(cmovge, _0, _1) +#define CMOVL(_0, _1) INSTR_(cmovl, _0, _1) +#define CMOVLE(_0, _1) INSTR_(cmovle, _0, _1) +#define CMOVNA(_0, _1) INSTR_(cmovna, _0, _1) +#define CMOVNAE(_0, _1) INSTR_(cmovnae, _0, _1) +#define CMOVNB(_0, _1) INSTR_(cmovnb, _0, _1) +#define CMOVNBE(_0, _1) INSTR_(cmovnbe, _0, _1) +#define CMOVNC(_0, _1) INSTR_(cmovnc, _0, _1) +#define CMOVNP(_0, _1) INSTR_(cmovnp, _0, _1) +#define CMOVNO(_0, _1) INSTR_(cmovno, _0, _1) +#define CMOVNS(_0, _1) INSTR_(cmovns, _0, _1) +#define CMOVNE(_0, _1) INSTR_(cmovne, _0, _1) +#define CMOVNZ(_0, _1) INSTR_(cmovnz, _0, _1) +#define CMOVNG(_0, _1) INSTR_(cmovng, _0, _1) +#define CMOVNGE(_0, _1) INSTR_(cmovnge, _0, _1) +#define CMOVNL(_0, _1) INSTR_(cmovnl, _0, _1) 
+#define CMOVNLE(_0, _1) INSTR_(cmovnle, _0, _1) #define lea(_0, _1) LEA(_0, _1) #define mov(_0, _1) MOV(_0, _1) #define movd(_0, _1) MOVD(_0, _1) #define movl(_0, _1) MOVL(_0, _1) #define movq(_0, _1) MOVQ(_0, _1) +#define cmova(_0, _1) CMOVA(_0, _1) +#define cmovae(_0, _1) CMOVAE(_0, _1) +#define cmovb(_0, _1) CMOVB(_0, _1) +#define cmovbe(_0, _1) CMOVBE(_0, _1) +#define cmovc(_0, _1) CMOVC(_0, _1) +#define cmovp(_0, _1) CMOVP(_0, _1) +#define cmovo(_0, _1) CMOVO(_0, _1) +#define cmovs(_0, _1) CMOVS(_0, _1) +#define cmove(_0, _1) CMOVE(_0, _1) +#define cmovz(_0, _1) CMOVZ(_0, _1) +#define cmovg(_0, _1) CMOVG(_0, _1) +#define cmovge(_0, _1) CMOVGE(_0, _1) +#define cmovl(_0, _1) CMOVL(_0, _1) +#define cmovle(_0, _1) CMOVLE(_0, _1) +#define cmovna(_0, _1) CMOVNA(_0, _1) +#define cmovnae(_0, _1) CMOVNAE(_0, _1) +#define cmovnb(_0, _1) CMOVNB(_0, _1) +#define cmovnbe(_0, _1) CMOVNBE(_0, _1) +#define cmovnc(_0, _1) CMOVNC(_0, _1) +#define cmovnp(_0, _1) CMOVNP(_0, _1) +#define cmovno(_0, _1) CMOVNO(_0, _1) +#define cmovns(_0, _1) CMOVNS(_0, _1) +#define cmovne(_0, _1) CMOVNE(_0, _1) +#define cmovnz(_0, _1) CMOVNZ(_0, _1) +#define cmovng(_0, _1) CMOVNG(_0, _1) +#define cmovnge(_0, _1) CMOVNGE(_0, _1) +#define cmovnl(_0, _1) CMOVNL(_0, _1) +#define cmovnle(_0, _1) CMOVNLE(_0, _1) // Vector moves @@ -1038,6 +1094,28 @@ #define v4fnmaddss(_0, _1, _2) V4FNMADDSS(_0, _1, _2) #define v4fnmaddps(_0, _1, _2) V4FNMADDPS(_0, _1, _2) +// Conversions + +#define CVTSS2SD(_0, _1) INSTR_(cvtss2sd, _0, _1) +#define CVTSD2SS(_0, _1) INSTR_(cvtsd2ss, _0, _1) +#define CVTPS2PD(_0, _1) INSTR_(cvtps2pd, _0, _1) +#define CVTPD2PS(_0, _1) INSTR_(cvtpd2ps, _0, _1) + +#define cvtss2sd(_0, _1) CVTSS2SD(_0, _1) +#define cvtsd2ss(_0, _1) CVTSD2SS(_0, _1) +#define cvtps2pd(_0, _1) CVTPS2PD(_0, _1) +#define cvtpd2ps(_0, _1) CVTPD2PS(_0, _1) + +#define VCVTSS2SD(_0, _1) INSTR_(vcvtss2sd, _0, _1) +#define VCVTSD2SS(_0, _1) INSTR_(vcvtsd2ss, _0, _1) +#define VCVTPS2PD(_0, _1) INSTR_(vcvtps2pd, _0, _1) 
+#define VCVTPD2PS(_0, _1) INSTR_(vcvtpd2ps, _0, _1) + +#define vcvtss2sd(_0, _1) VCVTSS2SD(_0, _1) +#define vcvtsd2ss(_0, _1) VCVTSD2SS(_0, _1) +#define vcvtps2pd(_0, _1) VCVTPS2PD(_0, _1) +#define vcvtpd2ps(_0, _1) VCVTPD2PS(_0, _1) + // Vector shuffles #define PSHUFD(_0, _1, _2) INSTR_(pshufd, _0, _1, _2) diff --git a/frame/include/blis.h b/frame/include/blis.h index 1599d3fbd..4a0b977b3 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -126,6 +126,7 @@ extern "C" { #include "bli_setri.h" #include "bli_castm.h" +#include "bli_castnzm.h" #include "bli_castv.h" #include "bli_projm.h" #include "bli_projv.h" diff --git a/frame/include/level0/1m/bli_set1ms_mxn.h b/frame/include/level0/1m/bli_set1ms_mxn.h index 0f847bb6f..4cb41952a 100644 --- a/frame/include/level0/1m/bli_set1ms_mxn.h +++ b/frame/include/level0/1m/bli_set1ms_mxn.h @@ -37,6 +37,18 @@ // set1ms_mxn +#define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + /* Include real domain version to facilitate macro-izing mixed-datatype + components of packm. */ \ +} + +#define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + /* Include real domain version to facilitate macro-izing mixed-datatype + components of packm. */ \ +} + #define bli_cset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ inc_t offm_local = offm; \ diff --git a/frame/include/level0/bli_adds_mxn.h b/frame/include/level0/bli_adds_mxn.h index 407380e8b..ab06fb362 100644 --- a/frame/include/level0/bli_adds_mxn.h +++ b/frame/include/level0/bli_adds_mxn.h @@ -41,62 +41,473 @@ // - The first char encodes the type of x. // - The second char encodes the type of y. 
-#define bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ + +// xy = ?s + +static void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, + float* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_ssadds( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_ssadds( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_ssadds( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, + float* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dsadds( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_dsadds( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dsadds( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + float* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 
0; ii < m; ++ii ) + bli_csadds( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_csadds( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_csadds( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + float* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zsadds( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_zsadds( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zsadds( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } } -#define bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ +// xy = ?d + +static void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, + double* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_sdadds( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_sdadds( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 
0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_sdadds( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, + double* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_ddadds( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_ddadds( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_ddadds( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + double* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_cdadds( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_cdadds( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_cdadds( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + double* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zdadds( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && 
cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_zdadds( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zdadds( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } } -#define bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ +// xy = ?c + +static void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, + scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_scadds( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_scadds( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_scadds( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, + scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dcadds( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_dcadds( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dcadds( *(x + ii*rs_x + jj*cs_x), + *(y + 
ii*rs_y + jj*cs_y) ); + } +} +static void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_ccadds( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_ccadds( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_ccadds( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zcadds( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_zcadds( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zcadds( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } } -#define bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ +// xy = ?z + +static void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, + dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 
0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_szadds( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_szadds( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_szadds( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, + dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dzadds( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_dzadds( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dzadds( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_czadds( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_czadds( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_czadds( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_zzadds_mxn( const dim_t m, const dim_t n, 
dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zzadds( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_zzadds( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zzadds( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } } -#define bli_sadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ + +static void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, + float* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } -#define bli_dadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ +static void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, + double* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } -#define bli_cadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ +static void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } -#define bli_zadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ +static void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, 
+ dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } + #endif diff --git a/frame/include/level0/bli_copyjnzs.h b/frame/include/level0/bli_copyjnzs.h new file mode 100644 index 000000000..6bacdeb90 --- /dev/null +++ b/frame/include/level0/bli_copyjnzs.h @@ -0,0 +1,80 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_COPYJNZS_H +#define BLIS_COPYJNZS_H + +// copyjnzs + +// Notes: +// - The first char encodes the type of x. +// - The second char encodes the type of y. + +#define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) +#define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) +#define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) +#define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) + +#define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) +#define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) +#define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) +#define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) + +// NOTE: Use of scopyjris() (implemented in terms of scopyris()) is so we +// don't touch the imaginary part of y. +#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) +#define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) +#define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) +#define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) + +// NOTE: Use of dcopyjris() (implemented in terms of dcopyris()) is so we +// don't touch the imaginary part of y. 
+#define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) +#define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) +#define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) +#define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) + + +#define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); } + + +#define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y ) +#define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y ) +#define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y ) +#define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y ) +#define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y ) + + +#endif + diff --git a/frame/include/level0/bli_copynzs.h b/frame/include/level0/bli_copynzs.h new file mode 100644 index 000000000..860b80e1e --- /dev/null +++ b/frame/include/level0/bli_copynzs.h @@ -0,0 +1,78 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_COPYNZS_H +#define BLIS_COPYNZS_H + +// copynzs + +// Notes: +// - The first char encodes the type of x. +// - The second char encodes the type of y. + +#define bli_sscopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) +#define bli_dscopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) +#define bli_cscopynzs( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) +#define bli_zscopynzs( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) + +#define bli_sdcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) +#define bli_ddcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) +#define bli_cdcopynzs( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) +#define bli_zdcopynzs( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) + +// NOTE: Use of scopyris() is so we don't touch the imaginary part of y. 
+#define bli_sccopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) +#define bli_dccopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) +#define bli_cccopynzs( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) +#define bli_zccopynzs( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) + +// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y. +#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) +#define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) +#define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) +#define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) + + +#define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); } + + +#define bli_scopynzs( x, y ) bli_sscopynzs( x, y ) +#define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y ) +#define bli_ccopynzs( x, y ) bli_cccopynzs( x, y ) +#define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y ) +#define bli_icopynzs( x, y ) bli_iicopynzs( x, y ) + + +#endif + diff --git a/frame/include/level0/bli_copys.h b/frame/include/level0/bli_copys.h index c5d7d9a41..6dbd047ac 100644 --- a/frame/include/level0/bli_copys.h +++ b/frame/include/level0/bli_copys.h @@ -51,32 +51,18 @@ #define bli_cdcopys( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopys( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) -#ifndef BLIS_ENABLE_C99_COMPLEX - +// NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero. 
#define bli_sccopys( x, y ) bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopys( x, y ) bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopys( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopys( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) +// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero. #define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) -#else // ifdef BLIS_ENABLE_C99_COMPLEX - -#define bli_sccopys( x, y ) { (y) = (x); } -#define bli_dccopys( x, y ) { (y) = (x); } -#define bli_cccopys( x, y ) { (y) = (x); } -#define bli_zccopys( x, y ) { (y) = (x); } - -#define bli_szcopys( x, y ) { (y) = (x); } -#define bli_dzcopys( x, y ) { (y) = (x); } -#define bli_czcopys( x, y ) { (y) = (x); } -#define bli_zzcopys( x, y ) { (y) = (x); } - -#endif // BLIS_ENABLE_C99_COMPLEX - #define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); } diff --git a/frame/include/level0/bli_copys_mxn.h b/frame/include/level0/bli_copys_mxn.h index 650ebc95b..dc85756b3 100644 --- a/frame/include/level0/bli_copys_mxn.h +++ b/frame/include/level0/bli_copys_mxn.h @@ -41,62 +41,470 @@ // - The first char encodes the type of x. // - The second char encodes the type of y. 
-#define bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ +// xy = ?s + +static void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, + float* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_sscopys( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_sscopys( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_sscopys( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, + float* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dscopys( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_dscopys( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dscopys( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + float* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( 
dim_t ii = 0; ii < m; ++ii ) + bli_cscopys( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_cscopys( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_cscopys( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + float* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zscopys( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_zscopys( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zscopys( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } } -#define bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ +// xy = ?d + +static void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, + double* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_sdcopys( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_sdcopys( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif 
+ { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_sdcopys( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, + double* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_ddcopys( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_ddcopys( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_ddcopys( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + double* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_cdcopys( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_cdcopys( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_cdcopys( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + double* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zdcopys( *(x + ii + jj*cs_x), + *(y + ii + 
jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_zdcopys( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zdcopys( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } } -#define bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ +// xy = ?c + +static void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, + scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_sccopys( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_sccopys( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_sccopys( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, + scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dccopys( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_dccopys( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; 
++ii ) + bli_dccopys( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_cccopys( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_cccopys( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_cccopys( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zccopys( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_zccopys( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zccopys( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } } -#define bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ - *(y + _i*rs_y + _j*cs_y) ); \ +// xy = ?z + +static void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, + dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef
BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_szcopys( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_szcopys( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_szcopys( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, + dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dzcopys( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_dzcopys( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dzcopys( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_czcopys( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_czcopys( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_czcopys( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + 
jj*cs_y) ); + } +} +static void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zzcopys( *(x + ii + jj*cs_x), + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_zzcopys( *(x + ii*rs_x + jj), + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zzcopys( *(x + ii*rs_x + jj*cs_x), + *(y + ii*rs_y + jj*cs_y) ); + } } -#define bli_scopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ +static void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, + float* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } -#define bli_dcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ +static void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, + double* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } -#define bli_ccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ +static void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } -#define bli_zcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ -{ \ - bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ +static void 
bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif diff --git a/frame/include/level0/bli_xpbys_mxn.h b/frame/include/level0/bli_xpbys_mxn.h index 93e69a7c1..89e62f379 100644 --- a/frame/include/level0/bli_xpbys_mxn.h +++ b/frame/include/level0/bli_xpbys_mxn.h @@ -42,106 +42,605 @@ // - The second char encodes the type of b. // - The third char encodes the type of y. -#define bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{ \ - /* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \ - if ( bli_seq0( *beta ) ) \ - { \ - bli_sscopys_mxn( m, n, \ - x, rs_x, cs_x, \ - y, rs_y, cs_y ); \ - } \ - else \ - { \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ - *(beta), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ + +// xby = ?ss + +static void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, + float* restrict beta, + float* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
+ if ( bli_seq0( *beta ) ) + { + bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); + return; + } + +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_sssxpbys( *(x + ii + jj*cs_x), *beta, + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_sssxpbys( *(x + ii*rs_x + jj), *beta, + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, + float* restrict beta, + float* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + // If beta is zero, overwrite y with x (in case y has infs or NaNs). + if ( bli_seq0( *beta ) ) + { + bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); + return; + } + +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dssxpbys( *(x + ii + jj*cs_x), *beta, + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_dssxpbys( *(x + ii*rs_x + jj), *beta, + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + float* restrict beta, + float* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
+ if ( bli_seq0( *beta ) ) + { + bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); + return; + } + +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_cssxpbys( *(x + ii + jj*cs_x), *beta, + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_cssxpbys( *(x + ii*rs_x + jj), *beta, + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + float* restrict beta, + float* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + // If beta is zero, overwrite y with x (in case y has infs or NaNs). + if ( bli_seq0( *beta ) ) + { + bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); + return; + } + +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zssxpbys( *(x + ii + jj*cs_x), *beta, + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_zssxpbys( *(x + ii*rs_x + jj), *beta, + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, + *(y + ii*rs_y + jj*cs_y) ); + } } -#define bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{ \ - /* If beta is zero, overwrite y with x (in case y has infs or NaNs). 
*/ \ - if ( bli_deq0( *beta ) ) \ - { \ - bli_ddcopys_mxn( m, n, \ - x, rs_x, cs_x, \ - y, rs_y, cs_y ); \ - } \ - else \ - { \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ - *(beta), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ +// xby = ?dd + +static void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, + double* restrict beta, + double* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + // If beta is zero, overwrite y with x (in case y has infs or NaNs). + if ( bli_deq0( *beta ) ) + { + bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); + return; + } + +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_sddxpbys( *(x + ii + jj*cs_x), *beta, + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_sddxpbys( *(x + ii*rs_x + jj), *beta, + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, + double* restrict beta, + double* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
+ if ( bli_deq0( *beta ) ) + { + bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); + return; + } + +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dddxpbys( *(x + ii + jj*cs_x), *beta, + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_dddxpbys( *(x + ii*rs_x + jj), *beta, + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + double* restrict beta, + double* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + // If beta is zero, overwrite y with x (in case y has infs or NaNs). + if ( bli_deq0( *beta ) ) + { + bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); + return; + } + +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_cddxpbys( *(x + ii + jj*cs_x), *beta, + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_cddxpbys( *(x + ii*rs_x + jj), *beta, + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + double* restrict beta, + double* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
+ if ( bli_deq0( *beta ) ) + { + bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); + return; + } + +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zddxpbys( *(x + ii + jj*cs_x), *beta, + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_zddxpbys( *(x + ii*rs_x + jj), *beta, + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, + *(y + ii*rs_y + jj*cs_y) ); + } } -#define bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{ \ - /* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \ - if ( bli_ceq0( *beta ) ) \ - { \ - bli_cccopys_mxn( m, n, \ - x, rs_x, cs_x, \ - y, rs_y, cs_y ); \ - } \ - else \ - { \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ - *(beta), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ +// xby = ?cc + +static void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, + scomplex* restrict beta, + scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
+ if ( bli_ceq0( *beta ) ) + { + bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); + return; + } + +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_sccxpbys( *(x + ii + jj*cs_x), *beta, + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_sccxpbys( *(x + ii*rs_x + jj), *beta, + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, + scomplex* restrict beta, + scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + // If beta is zero, overwrite y with x (in case y has infs or NaNs). + if ( bli_ceq0( *beta ) ) + { + bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); + return; + } + +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dccxpbys( *(x + ii + jj*cs_x), *beta, + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_dccxpbys( *(x + ii*rs_x + jj), *beta, + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + scomplex* restrict beta, + scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
+ if ( bli_ceq0( *beta ) ) + { + bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); + return; + } + +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_cccxpbys( *(x + ii + jj*cs_x), *beta, + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_cccxpbys( *(x + ii*rs_x + jj), *beta, + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + scomplex* restrict beta, + scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + // If beta is zero, overwrite y with x (in case y has infs or NaNs). + if ( bli_ceq0( *beta ) ) + { + bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); + return; + } + +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zccxpbys( *(x + ii + jj*cs_x), *beta, + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_zccxpbys( *(x + ii*rs_x + jj), *beta, + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, + *(y + ii*rs_y + jj*cs_y) ); + } } -#define bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{ \ - /* If beta is zero, overwrite y with x (in case y has infs or NaNs). 
*/ \ - if ( bli_zeq0( *beta ) ) \ - { \ - bli_zzcopys_mxn( m, n, \ - x, rs_x, cs_x, \ - y, rs_y, cs_y ); \ - } \ - else \ - { \ - dim_t _i, _j; \ -\ - for ( _j = 0; _j < n; ++_j ) \ - for ( _i = 0; _i < m; ++_i ) \ - bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ - *(beta), \ - *(y + _i*rs_y + _j*cs_y) ); \ - } \ +// xby = ?zz + +static void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, + dcomplex* restrict beta, + dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + // If beta is zero, overwrite y with x (in case y has infs or NaNs). + if ( bli_zeq0( *beta ) ) + { + bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); + return; + } + +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_szzxpbys( *(x + ii + jj*cs_x), *beta, + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_szzxpbys( *(x + ii*rs_x + jj), *beta, + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, + dcomplex* restrict beta, + dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
+ if ( bli_zeq0( *beta ) ) + { + bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); + return; + } + +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + dcomplex* restrict beta, + dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + // If beta is zero, overwrite y with x (in case y has infs or NaNs). + if ( bli_zeq0( *beta ) ) + { + bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); + return; + } + +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_czzxpbys( *(x + ii + jj*cs_x), *beta, + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_czzxpbys( *(x + ii*rs_x + jj), *beta, + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, + *(y + ii*rs_y + jj*cs_y) ); + } +} +static void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + dcomplex* restrict beta, + dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
+ if ( bli_zeq0( *beta ) ) + { + bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); + return; + } + +#ifdef BLIS_ENABLE_CR_CASES + if ( rs_x == 1 && rs_y == 1 ) + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, + *(y + ii + jj*cs_y) ); + } + else if ( cs_x == 1 && cs_y == 1 ) + { + for ( dim_t ii = 0; ii < m; ++ii ) + for ( dim_t jj = 0; jj < n; ++jj ) + bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, + *(y + ii*rs_y + jj) ); + } + else +#endif + { + for ( dim_t jj = 0; jj < n; ++jj ) + for ( dim_t ii = 0; ii < m; ++ii ) + bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, + *(y + ii*rs_y + jj*cs_y) ); + } } -#define bli_sxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{\ - bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ + +static void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, + float* restrict beta, + float* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } -#define bli_dxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{\ - bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ +static void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, + double* restrict beta, + double* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } -#define bli_cxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{\ - bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ +static void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + scomplex* restrict beta, + scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } -#define bli_zxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ -{\ - 
bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ +static void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, + dcomplex* restrict beta, + dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) +{ + bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } + #endif diff --git a/frame/include/level0/bli_cast.h b/frame/include/level0/old/bli_cast.h similarity index 100% rename from frame/include/level0/bli_cast.h rename to frame/include/level0/old/bli_cast.h diff --git a/frame/include/level0/ri/bli_copyjris.h b/frame/include/level0/ri/bli_copyjris.h index 6ca3ab432..910724bbd 100644 --- a/frame/include/level0/ri/bli_copyjris.h +++ b/frame/include/level0/ri/bli_copyjris.h @@ -42,5 +42,25 @@ #define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) ) #define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) ) +#define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi ) +#define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi ) +#define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi ) +#define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi ) + +#define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi ) +#define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi ) +#define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi ) +#define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi ) + +#define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi ) +#define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi ) +#define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi ) +#define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi ) + +#define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi ) +#define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi ) 
+#define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi ) +#define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi ) + #endif diff --git a/frame/include/level0/ri/bli_copyris.h b/frame/include/level0/ri/bli_copyris.h index b4eef9363..fa7d6b0a6 100644 --- a/frame/include/level0/ri/bli_copyris.h +++ b/frame/include/level0/ri/bli_copyris.h @@ -59,5 +59,24 @@ (bi) = (ai); \ } -#endif +#define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi ) +#define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi ) +#define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi ) +#define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi ) +#define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi ) +#define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi ) +#define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi ) +#define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi ) + +#define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi ) +#define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi ) +#define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi ) +#define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi ) + +#define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi ) +#define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi ) +#define bli_czcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi ) +#define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi ) + +#endif diff --git a/frame/ind/misc/bli_l3_ind_opt.h b/frame/ind/misc/bli_l3_ind_opt.h index 862428ae3..9d59a6cb6 100644 --- a/frame/ind/misc/bli_l3_ind_opt.h +++ b/frame/ind/misc/bli_l3_ind_opt.h @@ -49,7 +49,8 @@ \ /* If beta is in the real domain, and c is row- or column-stored, then we may proceed with the optimization. 
*/ \ - if ( bli_obj_imag_equals( &beta, &BLIS_ZERO ) && \ + if ( /*bli_obj_imag_equals( &beta, &BLIS_ZERO ) &&*/ \ + bli_obj_imag_is_zero( &beta ) && \ !bli_is_gen_stored( rs_c, cs_c ) ) \ { \ dt_exec = bli_dt_proj_to_real( dt_exec ); \ diff --git a/sandbox/ref99/blx_gemm_front.c b/sandbox/ref99/blx_gemm_front.c index d1d56eee8..841cf3153 100644 --- a/sandbox/ref99/blx_gemm_front.c +++ b/sandbox/ref99/blx_gemm_front.c @@ -97,19 +97,6 @@ void blx_gemm_front bli_obj_induce_trans( &c_local ); } - // Parse and interpret the contents of the rntm_t object to properly - // set the ways of parallelism for each loop, and then make any - // additional modifications necessary for the current operation. - bli_rntm_set_ways_for_op - ( - BLIS_GEMM, - BLIS_LEFT, // ignored for gemm - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ), - rntm - ); - { // A sort of hack for communicating the desired pach schemas for A and // B to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and @@ -131,6 +118,19 @@ void blx_gemm_front } } + // Parse and interpret the contents of the rntm_t object to properly + // set the ways of parallelism for each loop, and then make any + // additional modifications necessary for the current operation. + bli_rntm_set_ways_for_op + ( + BLIS_GEMM, + BLIS_LEFT, // ignored for gemm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + rntm + ); + // Invoke the internal back-end via the thread handler. blx_gemm_thread ( diff --git a/sandbox/ref99/vars/blx_gemm_ker_var2.c b/sandbox/ref99/vars/blx_gemm_ker_var2.c index 2a1cbe6b6..c780489e9 100644 --- a/sandbox/ref99/vars/blx_gemm_ker_var2.c +++ b/sandbox/ref99/vars/blx_gemm_ker_var2.c @@ -254,6 +254,9 @@ void PASTECH2(blx_,ch,varname) \ /* Save the imaginary stride of A and B to the auxinfo_t object. 
*/ \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + bli_auxinfo_set_dt_on_output( dt, &aux ); \ \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ diff --git a/test/3m4m/Makefile b/test/3m4m/Makefile index 3c2a52124..3dcd6d435 100644 --- a/test/3m4m/Makefile +++ b/test/3m4m/Makefile @@ -200,9 +200,9 @@ STR_ST := -DTHR_STR=\"st\" STR_MT := -DTHR_STR=\"mt\" # Problem size specification -PDEF_ST := -DP_BEGIN=40 \ +PDEF_ST := -DP_BEGIN=100 \ -DP_END=2000 \ - -DP_INC=40 + -DP_INC=100 PDEF_MT := -DP_BEGIN=200 \ -DP_END=10000 \ diff --git a/test/3m4m/test_gemm.c b/test/3m4m/test_gemm.c index 64311753c..50aeb47ee 100644 --- a/test/3m4m/test_gemm.c +++ b/test/3m4m/test_gemm.c @@ -35,9 +35,6 @@ #include #include "blis.h" -void zgemm3m_( f77_char*, f77_char*, f77_int*, f77_int*, f77_int*, dcomplex*, dcomplex*, f77_int*, dcomplex*, f77_int*, dcomplex*, dcomplex*, f77_int* ); - - //#define PRINT int main( int argc, char** argv ) @@ -148,9 +145,6 @@ int main( int argc, char** argv ) bli_obj_create( dt, m, k, 0, 0, &a ); bli_obj_create( dt, k, n, 0, 0, &b ); bli_obj_create( dt, m, n, 0, 0, &c ); - //bli_obj_create( dt, m, k, 2, 2*m, &a ); - //bli_obj_create( dt, k, n, 2, 2*k, &b ); - //bli_obj_create( dt, m, n, 2, 2*m, &c ); bli_obj_create( dt, m, n, 0, 0, &c_save ); bli_randm( &a ); @@ -177,7 +171,6 @@ int main( int argc, char** argv ) { bli_copym( &c_save, &c ); - dtime = bli_clock(); diff --git a/test/mixeddt/Makefile b/test/mixeddt/Makefile new file mode 100644 index 000000000..cb9c3484e --- /dev/null +++ b/test/mixeddt/Makefile @@ -0,0 +1,401 @@ +#!/bin/bash +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. 
+# +# Copyright (C) 2014, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas at Austin nor the names +# of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# +# Makefile +# +# Field G. Van Zee +# +# Makefile for standalone BLIS test drivers. 
+# + +# +# --- Makefile PHONY target definitions ---------------------------------------- +# + +.PHONY: all all-st all-mt \ + blis blis-st blis-mt \ + blis-nat blis-nat-st blis-nat-mt \ + openblas openblas-st openblas-mt \ + mkl mkl-st mkl-mt \ + blis-gemm-st blis-gemm-mt \ + blis-gemm-nat-st blis-gemm-nat-mt \ + openblas-gemm-st openblas-gemm-mt \ + mkl-gemm-st mkl-gemm-mt \ + clean cleanx + + + +# +# --- Determine makefile fragment location ------------------------------------- +# + +# Comments: +# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given. +# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in +# the second case because CONFIG_NAME is not yet set. +ifneq ($(strip $(BLIS_INSTALL_PATH)),) +LIB_PATH := $(BLIS_INSTALL_PATH)/lib +INC_PATH := $(BLIS_INSTALL_PATH)/include/blis +SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis +else +DIST_PATH := ../.. +LIB_PATH = ../../lib/$(CONFIG_NAME) +INC_PATH = ../../include/$(CONFIG_NAME) +SHARE_PATH := ../.. +endif + + + +# +# --- Include common makefile definitions -------------------------------------- +# + +# Include the common makefile fragment. +-include $(SHARE_PATH)/common.mk + + + +# +# --- BLAS and LAPACK implementations ------------------------------------------ +# + +# BLIS library and header path. This is simply wherever it was installed. +#BLIS_LIB_PATH := $(INSTALL_PREFIX)/lib +#BLIS_INC_PATH := $(INSTALL_PREFIX)/include/blis + +# BLIS library. +#BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a + +# BLAS library path(s). This is where the BLAS libraries reside. 
+HOME_LIB_PATH := $(HOME)/flame/lib +#MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64 +#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64 +MKL_LIB_PATH := ${MKLROOT}/lib/intel64 +#ICC_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64 + +# OpenBLAS +OPENBLAS_LIB := $(HOME_LIB_PATH)/libopenblas.a +OPENBLASP_LIB := $(HOME_LIB_PATH)/libopenblasp.a + +# ATLAS +ATLAS_LIB := $(HOME_LIB_PATH)/libf77blas.a \ + $(HOME_LIB_PATH)/libatlas.a + +# MKL +MKL_LIB := -L$(MKL_LIB_PATH) \ + -lmkl_intel_lp64 \ + -lmkl_core \ + -lmkl_sequential \ + -lpthread -lm -ldl +#MKLP_LIB := -L$(MKL_LIB_PATH) \ +# -lmkl_intel_thread \ +# -lmkl_core \ +# -lmkl_intel_ilp64 \ +# -L$(ICC_LIB_PATH) \ +# -liomp5 +MKLP_LIB := -L$(MKL_LIB_PATH) \ + -lmkl_intel_lp64 \ + -lmkl_core \ + -lmkl_gnu_thread \ + -lpthread -lm -ldl -fopenmp + #-L$(ICC_LIB_PATH) \ + #-lgomp + + + +# +# --- General build definitions ------------------------------------------------ +# + +TEST_SRC_PATH := . +TEST_OBJ_PATH := . + +# Gather all local object files. +TEST_OBJS := $(sort $(patsubst $(TEST_SRC_PATH)/%.c, \ + $(TEST_OBJ_PATH)/%.o, \ + $(wildcard $(TEST_SRC_PATH)/*.c))) + +# Override the value of CINCFLAGS so that the value of CFLAGS returned by +# get-frame-cflags-for() is not cluttered up with include paths needed only +# while building BLIS. +CINCFLAGS := -I$(INC_PATH) + +# Use the "framework" CFLAGS for the configuration family. +CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME)) + +# Add local header paths to CFLAGS. +CFLAGS += -I$(TEST_SRC_PATH) + +# Locate the libblis library to which we will link. +LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L) + + +# Datatypes for A, B, and C. 
+#DTA_S := -DDTA=BLIS_FLOAT +#DTA_D := -DDTA=BLIS_DOUBLE +#DTA_C := -DDTA=BLIS_SCOMPLEX +#DTA_Z := -DDTA=BLIS_DCOMPLEX +# +#DTB_S := -DDTB=BLIS_FLOAT +#DTB_D := -DDTB=BLIS_DOUBLE +#DTB_C := -DDTB=BLIS_SCOMPLEX +#DTB_Z := -DDTB=BLIS_DCOMPLEX +# +#DTC_S := -DDTC=BLIS_FLOAT +#DTC_D := -DDTC=BLIS_DOUBLE +#DTC_C := -DDTC=BLIS_SCOMPLEX +#DTC_Z := -DDTC=BLIS_DCOMPLEX +# +#DTX_S := -DDTC=BLIS_FLOAT +#DTX_D := -DDTC=BLIS_DOUBLE + +# Which library? +BLI_DEF := -DBLIS +BLA_DEF := -DBLAS + +# Implementation string +STR_BLI := -DSTR=\"asm_blis\" +STR_OBL := -DSTR=\"openblas\" +STR_MKL := -DSTR=\"mkl\" + +# Single or multithreaded string +STR_ST := -DTHR_STR=\"st\" +STR_MT := -DTHR_STR=\"mt\" + +# Problem size specification +PDEF_ST := -DP_BEGIN=96 \ + -DP_END=1200 \ + -DP_INC=96 + +PDEF_MT := -DP_BEGIN=80 \ + -DP_END=4000 \ + -DP_INC=80 + +# Enumerate possible datatypes and computation precisions. +dts := s d c z +prs := s d + +# Various functions that help us construct the datatype combinations and then +# extract the needed datatype strings and C preprocessor define flags. 
+get-char-c = $(word 1,$(subst _, ,$(1))) +get-char-a = $(word 2,$(subst _, ,$(1))) +get-char-b = $(word 3,$(subst _, ,$(1))) +get-char-x = $(word 4,$(subst _, ,$(1))) +get-cstr = $(call get-char-c,$(1))$(call get-char-a,$(1))$(call get-char-b,$(1))$(call get-char-x,$(1)) + +get-cdef-a = $(strip $(subst s,-DDTA=BLIS_FLOAT, \ + $(subst d,-DDTA=BLIS_DOUBLE, \ + $(subst c,-DDTA=BLIS_SCOMPLEX, \ + $(subst z,-DDTA=BLIS_DCOMPLEX,$(call get-char-a,$(1))))))) +get-cdef-b = $(strip $(subst s,-DDTB=BLIS_FLOAT, \ + $(subst d,-DDTB=BLIS_DOUBLE, \ + $(subst c,-DDTB=BLIS_SCOMPLEX, \ + $(subst z,-DDTB=BLIS_DCOMPLEX,$(call get-char-b,$(1))))))) +get-cdef-c = $(strip $(subst s,-DDTC=BLIS_FLOAT, \ + $(subst d,-DDTC=BLIS_DOUBLE, \ + $(subst c,-DDTC=BLIS_SCOMPLEX, \ + $(subst z,-DDTC=BLIS_DCOMPLEX,$(call get-char-c,$(1))))))) +get-cdef-x = $(strip $(subst s,-DDTX=BLIS_FLOAT, \ + $(subst d,-DDTX=BLIS_DOUBLE,$(call get-char-x,$(1))))) +get-cdefs = $(call get-cdef-c,$(1)) $(call get-cdef-a,$(1)) $(call get-cdef-b,$(1)) $(call get-cdef-x,$(1)) + +# Define a function to return the appropriate -DSTR= and -D[BLIS|BLAS] flags. +get-idefs = $(strip $(subst asm_blis,-DSTR=\"$(1)\" -DBLIS, \ + $(subst openblas,-DSTR=\"$(1)\" -DBLAS, \ + $(subst mkl,-DSTR=\"$(1)\" -DBLAS,$(1))))) + +# Enumerate all possible datatype combinations. +DT_CODES := $(foreach dt0,$(dts),$(foreach dt1,$(dts),$(foreach dt2,$(dts),$(foreach pr,$(prs),$(dt0)_$(dt1)_$(dt2)_$(pr))))) + +# Build a list of the datatype strings. +DT_COMBOS := $(foreach code,$(DT_CODES),$(call get-cstr,$(code))) + +# Build a list of BLIS, OpenBLAS, and MKL executables. 
+BLIS_OBJS_ST := $(foreach combo,$(DT_COMBOS),test_$(combo)gemm_asm_blis_st.o) +BLIS_BINS_ST := $(patsubst %.o,%.x,$(BLIS_OBJS_ST)) +OPENBLAS_OBJS_ST := $(foreach combo,$(DT_COMBOS),test_$(combo)gemm_openblas_st.o) +OPENBLAS_BINS_ST := $(patsubst %.o,%.x,$(OPENBLAS_OBJS_ST)) + +BLIS_OBJS_MT := $(foreach combo,$(DT_COMBOS),test_$(combo)gemm_asm_blis_mt.o) +BLIS_BINS_MT := $(patsubst %.o,%.x,$(BLIS_OBJS_MT)) +OPENBLAS_OBJS_MT := $(foreach combo,$(DT_COMBOS),test_$(combo)gemm_openblas_mt.o) +OPENBLAS_BINS_MT := $(patsubst %.o,%.x,$(OPENBLAS_OBJS_MT)) + +#MKL_OBJS_ST := $(foreach combo,$(DT_COMBOS),test_$(combo)gemm_mkl_st.o) + +#BLIS_OBJS_MT := $(foreach combo,$(DT_COMBOS),test_$(combo)gemm_asm_blis_mt.o) +#OPENBLAS_OBJS_MT := $(foreach combo,$(DT_COMBOS),test_$(combo)gemm_openblas_mt.o) +#MKL_OBJS_MT := $(foreach combo,$(DT_COMBOS),test_$(combo)gemm_mkl_mt.o) + + + +# +# --- Targets/rules ------------------------------------------------------------ +# + +all: st + +st: blis-st openblas-st +mt: blis-mt openblas-mt + +blis-st: $(BLIS_BINS_ST) +openblas-st: $(OPENBLAS_BINS_ST) +blis-mt: $(BLIS_BINS_MT) +openblas-mt: $(OPENBLAS_BINS_MT) +#blis: test_ssssgemm_asm_blis_st.x \ +# test_sssdgemm_asm_blis_st.x \ +# test_ssdsgemm_asm_blis_st.x \ +# test_sdssgemm_asm_blis_st.x \ +# test_dsssgemm_asm_blis_st.x \ +# test_dddsgemm_asm_blis_st.x \ +# test_ddddgemm_asm_blis_st.x +#openblas: test_ssssgemm_openblas_st.x \ +# test_sssdgemm_openblas_st.x \ +# test_ssdsgemm_openblas_st.x \ +# test_sdssgemm_openblas_st.x \ +# test_dsssgemm_openblas_st.x \ +# test_dddsgemm_openblas_st.x \ +# test_ddddgemm_openblas_st.x + + +# --Object file rules -- + +# Define the function that will be used to instantiate compilation rules +# for the various implementations. 
+define make-st-rule +test_$(call get-cstr,$(1))gemm_$(2)_st.o: test_gemm.c Makefile +ifeq ($(ENABLE_VERBOSE),yes) + $(CC) $(CFLAGS) $(PDEF_ST) $(call get-cdefs,$(1)) $(call get-idefs,$(2)) $(STR_ST) -c $$< -o $$@ +else + @echo "Compiling $$@" + @$(CC) $(CFLAGS) $(PDEF_ST) $(call get-cdefs,$(1)) $(call get-idefs,$(2)) $(STR_ST) -c $$< -o $$@ +endif +endef + +define make-mt-rule +test_$(call get-cstr,$(1))gemm_$(2)_mt.o: test_gemm.c Makefile +ifeq ($(ENABLE_VERBOSE),yes) + $(CC) $(CFLAGS) $(PDEF_MT) $(call get-cdefs,$(1)) $(call get-idefs,$(2)) $(STR_MT) -c $$< -o $$@ +else + @echo "Compiling $$@" + @$(CC) $(CFLAGS) $(PDEF_MT) $(call get-cdefs,$(1)) $(call get-idefs,$(2)) $(STR_MT) -c $$< -o $$@ +endif +endef + + +# Define the implementations for which we will instantiate compilation rules. +IMPLS := asm_blis openblas + +# Instantiate the rule function make-st-rule() and make-mt-rule for each +# implementation in IMPLS and each of the datatype "codes" in DT_CODES. +$(foreach impl,$(IMPLS), \ +$(foreach code,$(DT_CODES),$(eval $(call make-st-rule,$(code),$(impl))))) + +$(foreach impl,$(IMPLS), \ +$(foreach code,$(DT_CODES),$(eval $(call make-mt-rule,$(code),$(impl))))) + + +# -- Executable file rules -- + +# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS +# on the link command line in case BLIS was configured with the BLAS +# compatibility layer. This prevents BLIS from inadvertently getting called +# for the BLAS routines we are trying to test with. 
+ +test_%_openblas_st.x: test_%_openblas_st.o $(LIBBLIS_LINK) +ifeq ($(ENABLE_VERBOSE),yes) + $(LINKER) $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@ + $(RM_F) $< +else + @@echo "Linking $@ to '$(notdir $(OPENBLAS_LIB)) $(LIBBLIS_LINK)'" + @$(LINKER) $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@ + @$(RM_F) $< +endif + +test_%_openblas_mt.x: test_%_openblas_mt.o $(LIBBLIS_LINK) +ifeq ($(ENABLE_VERBOSE),yes) + $(LINKER) $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@ + $(RM_F) $< +else + @@echo "Linking $@ to '$(notdir $(OPENBLASP_LIB)) $(LIBBLIS_LINK)'" + @$(LINKER) $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@ + @$(RM_F) $< +endif + +#test_%_mkl_st.x: test_%_mkl_st.o $(LIBBLIS_LINK) +#ifeq ($(ENABLE_VERBOSE),yes) +# $(LINKER) $< $(MKL_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@ +# $(RM_F) $< +#else +# @@echo "Linking $@ to '$(notdir $(MKL_LIB)) $(LIBBLIS_LINK)'" +# @$(LINKER) $< $(MKL_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@ +# @$(RM_F) $< +#endif + +#test_%_mkl_mt.x: test_%_mkl_mt.o $(LIBBLIS_LINK) +# $(LINKER) $< $(MKLP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@ + +test_%_blis_st.x: test_%_blis_st.o $(LIBBLIS_LINK) +ifeq ($(ENABLE_VERBOSE),yes) + $(LINKER) $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@ + $(RM_F) $< +else + @@echo "Linking $@ to '$(LIBBLIS_LINK)'" + @$(LINKER) $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@ + @$(RM_F) $< +endif + +test_%_blis_mt.x: test_%_blis_mt.o $(LIBBLIS_LINK) +ifeq ($(ENABLE_VERBOSE),yes) + $(LINKER) $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@ + $(RM_F) $< +else + @@echo "Linking $@ to '$(LIBBLIS_LINK)'" + @$(LINKER) $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@ + @$(RM_F) $< +endif + + +# -- Clean rules -- + +clean: cleanx + +cleanx: + - $(RM_F) *.o *.x + +cleanout: + - $(RM_F) *.m + + diff --git a/test/mixeddt/matlab/gemm_md.pdf b/test/mixeddt/matlab/gemm_md.pdf new file mode 100644 index 000000000..e665aef46 Binary files /dev/null and b/test/mixeddt/matlab/gemm_md.pdf differ diff --git a/test/mixeddt/matlab/gen_dt_combos.m
b/test/mixeddt/matlab/gen_dt_combos.m new file mode 100644 index 000000000..ee0fe8389 --- /dev/null +++ b/test/mixeddt/matlab/gen_dt_combos.m @@ -0,0 +1,165 @@ +function r_val = gen_dt_combos() + +dt_chars = [ 's' 'd' 'c' 'z' ]; +pr_chars = [ 's' 'd' ]; + +if 0 +i = 1; +for dtc = dt_chars + for dta = dt_chars + for dtb = dt_chars + for pr = pr_chars + dt_combos( i, : ) = sprintf( '%c%c%c%c', dtc, dta, dtb, pr ); + i = i + 1; + end + end + end +end +end + +%n_combos = size(temp,1); + +if 1 +dt_combos( 1, : ) = 'ssss'; +dt_combos( 2, : ) = 'ssds'; +dt_combos( 3, : ) = 'sscs'; +dt_combos( 4, : ) = 'sszs'; +dt_combos( 5, : ) = 'sdss'; +dt_combos( 6, : ) = 'sdds'; +dt_combos( 7, : ) = 'sdcs'; +dt_combos( 8, : ) = 'sdzs'; +dt_combos( 9, : ) = 'sssd'; +dt_combos( 10, : ) = 'ssdd'; +dt_combos( 11, : ) = 'sscd'; +dt_combos( 12, : ) = 'sszd'; +dt_combos( 13, : ) = 'sdsd'; +dt_combos( 14, : ) = 'sddd'; +dt_combos( 15, : ) = 'sdcd'; +dt_combos( 16, : ) = 'sdzd'; + +dt_combos( 17, : ) = 'scss'; +dt_combos( 18, : ) = 'scds'; +dt_combos( 19, : ) = 'sccs'; +dt_combos( 20, : ) = 'sczs'; +dt_combos( 21, : ) = 'szss'; +dt_combos( 22, : ) = 'szds'; +dt_combos( 23, : ) = 'szcs'; +dt_combos( 24, : ) = 'szzs'; +dt_combos( 25, : ) = 'scsd'; +dt_combos( 26, : ) = 'scdd'; +dt_combos( 27, : ) = 'sccd'; +dt_combos( 28, : ) = 'sczd'; +dt_combos( 29, : ) = 'szsd'; +dt_combos( 30, : ) = 'szdd'; +dt_combos( 31, : ) = 'szcd'; +dt_combos( 32, : ) = 'szzd'; + +dt_combos( 33, : ) = 'dsss'; +dt_combos( 34, : ) = 'dsds'; +dt_combos( 35, : ) = 'dscs'; +dt_combos( 36, : ) = 'dszs'; +dt_combos( 37, : ) = 'ddss'; +dt_combos( 38, : ) = 'ddds'; +dt_combos( 39, : ) = 'ddcs'; +dt_combos( 40, : ) = 'ddzs'; +dt_combos( 41, : ) = 'dssd'; +dt_combos( 42, : ) = 'dsdd'; +dt_combos( 43, : ) = 'dscd'; +dt_combos( 44, : ) = 'dszd'; +dt_combos( 45, : ) = 'ddsd'; +dt_combos( 46, : ) = 'dddd'; +dt_combos( 47, : ) = 'ddcd'; +dt_combos( 48, : ) = 'ddzd'; + +dt_combos( 49, : ) = 'dcss'; +dt_combos( 50, : ) = 'dcds'; 
+dt_combos( 51, : ) = 'dccs'; +dt_combos( 52, : ) = 'dczs'; +dt_combos( 53, : ) = 'dzss'; +dt_combos( 54, : ) = 'dzds'; +dt_combos( 55, : ) = 'dzcs'; +dt_combos( 56, : ) = 'dzzs'; +dt_combos( 57, : ) = 'dcsd'; +dt_combos( 58, : ) = 'dcdd'; +dt_combos( 59, : ) = 'dccd'; +dt_combos( 60, : ) = 'dczd'; +dt_combos( 61, : ) = 'dzsd'; +dt_combos( 62, : ) = 'dzdd'; +dt_combos( 63, : ) = 'dzcd'; +dt_combos( 64, : ) = 'dzzd'; + +dt_combos( 65, : ) = 'csss'; +dt_combos( 66, : ) = 'csds'; +dt_combos( 67, : ) = 'cscs'; +dt_combos( 68, : ) = 'cszs'; +dt_combos( 69, : ) = 'cdss'; +dt_combos( 70, : ) = 'cdds'; +dt_combos( 71, : ) = 'cdcs'; +dt_combos( 72, : ) = 'cdzs'; +dt_combos( 73, : ) = 'cssd'; +dt_combos( 74, : ) = 'csdd'; +dt_combos( 75, : ) = 'cscd'; +dt_combos( 76, : ) = 'cszd'; +dt_combos( 77, : ) = 'cdsd'; +dt_combos( 78, : ) = 'cddd'; +dt_combos( 79, : ) = 'cdcd'; +dt_combos( 80, : ) = 'cdzd'; + +dt_combos( 81, : ) = 'ccss'; +dt_combos( 82, : ) = 'ccds'; +dt_combos( 83, : ) = 'cccs'; +dt_combos( 84, : ) = 'cczs'; +dt_combos( 85, : ) = 'czss'; +dt_combos( 86, : ) = 'czds'; +dt_combos( 87, : ) = 'czcs'; +dt_combos( 88, : ) = 'czzs'; +dt_combos( 89, : ) = 'ccsd'; +dt_combos( 90, : ) = 'ccdd'; +dt_combos( 91, : ) = 'cccd'; +dt_combos( 92, : ) = 'cczd'; +dt_combos( 93, : ) = 'czsd'; +dt_combos( 94, : ) = 'czdd'; +dt_combos( 95, : ) = 'czcd'; +dt_combos( 96, : ) = 'czzd'; + +dt_combos( 97, : ) = 'zsss'; +dt_combos( 98, : ) = 'zsds'; +dt_combos( 99, : ) = 'zscs'; +dt_combos( 100, : ) = 'zszs'; +dt_combos( 101, : ) = 'zdss'; +dt_combos( 102, : ) = 'zdds'; +dt_combos( 103, : ) = 'zdcs'; +dt_combos( 104, : ) = 'zdzs'; +dt_combos( 105, : ) = 'zssd'; +dt_combos( 106, : ) = 'zsdd'; +dt_combos( 107, : ) = 'zscd'; +dt_combos( 108, : ) = 'zszd'; +dt_combos( 109, : ) = 'zdsd'; +dt_combos( 110, : ) = 'zddd'; +dt_combos( 111, : ) = 'zdcd'; +dt_combos( 112, : ) = 'zdzd'; + +dt_combos( 113, : ) = 'zcss'; +dt_combos( 114, : ) = 'zcds'; +dt_combos( 115, : ) = 'zccs'; +dt_combos( 116, : ) = 
'zczs'; +dt_combos( 117, : ) = 'zzss'; +dt_combos( 118, : ) = 'zzds'; +dt_combos( 119, : ) = 'zzcs'; +dt_combos( 120, : ) = 'zzzs'; +dt_combos( 121, : ) = 'zcsd'; +dt_combos( 122, : ) = 'zcdd'; +dt_combos( 123, : ) = 'zccd'; +dt_combos( 124, : ) = 'zczd'; +dt_combos( 125, : ) = 'zzsd'; +dt_combos( 126, : ) = 'zzdd'; +dt_combos( 127, : ) = 'zzcd'; +dt_combos( 128, : ) = 'zzzd'; +end + + + + +r_val = dt_combos; + +end diff --git a/test/mixeddt/matlab/plot_all_md.m b/test/mixeddt/matlab/plot_all_md.m new file mode 100644 index 000000000..9302bdb0a --- /dev/null +++ b/test/mixeddt/matlab/plot_all_md.m @@ -0,0 +1,139 @@ +function r_val = plot_all_md( is_mt ) + +if is_mt == 1 + thr_str = 'mt'; +else + thr_str = 'st'; +end + +if 1 +dt_combos = gen_dt_combos(); +else +dt_combos( 1, : ) = [ 'ssss' ]; +dt_combos( 2, : ) = [ 'sssd' ]; +dt_combos( 3, : ) = [ 'ssds' ]; +dt_combos( 4, : ) = [ 'sdss' ]; +dt_combos( 5, : ) = [ 'dsss' ]; +dt_combos( 6, : ) = [ 'ddds' ]; +dt_combos( 7, : ) = [ 'dddd' ]; +end + +n_combos = size(dt_combos,1); + +filetemp_blis = '../output_%s_%sgemm_asm_blis.m'; +filetemp_open = '../output_%s_%sgemm_openblas.m'; + +% Construct filenames for the "reference" (single real) data, then load +% the data files, and finally save the results to different variable names. +file_blis_sref = sprintf( filetemp_blis, thr_str, 'ssss' ); +file_open_sref = sprintf( filetemp_open, thr_str, 'ssss' ); +%str = sprintf( ' Loading %s', file_blis_sref ); disp(str); +run( file_blis_sref ) +%str = sprintf( ' Loading %s', file_open_sref ); disp(str); +run( file_open_sref ) +data_gemm_asm_blis_sref( :, : ) = data_gemm_asm_blis( :, : ); +data_gemm_openblas_sref( :, : ) = data_gemm_openblas( :, : ); + +% Construct filenames for the "reference" (double real) data, then load +% the data files, and finally save the results to different variable names. 
+file_blis_dref = sprintf( filetemp_blis, thr_str, 'dddd' ); +file_open_dref = sprintf( filetemp_open, thr_str, 'dddd' ); +%str = sprintf( ' Loading %s', file_blis_dref ); disp(str); +run( file_blis_dref ) +%str = sprintf( ' Loading %s', file_open_dref ); disp(str); +run( file_open_dref ) +data_gemm_asm_blis_dref( :, : ) = data_gemm_asm_blis( :, : ); +data_gemm_openblas_dref( :, : ) = data_gemm_openblas( :, : ); + +% Construct filenames for the "reference" (single complex) data, then load +% the data files, and finally save the results to different variable names. +file_blis_cref = sprintf( filetemp_blis, thr_str, 'cccs' ); +file_open_cref = sprintf( filetemp_open, thr_str, 'cccs' ); +%str = sprintf( ' Loading %s', file_blis_cref ); disp(str); +run( file_blis_cref ) +%str = sprintf( ' Loading %s', file_open_cref ); disp(str); +run( file_open_cref ) +data_gemm_asm_blis_cref( :, : ) = data_gemm_asm_blis( :, : ); +data_gemm_openblas_cref( :, : ) = data_gemm_openblas( :, : ); + +% Construct filenames for the "reference" (double complex) data, then load +% the data files, and finally save the results to different variable names. +file_blis_zref = sprintf( filetemp_blis, thr_str, 'zzzd' ); +file_open_zref = sprintf( filetemp_open, thr_str, 'zzzd' ); +%str = sprintf( ' Loading %s', file_blis_zref ); disp(str); +run( file_blis_zref ) +%str = sprintf( ' Loading %s', file_open_zref ); disp(str); +run( file_open_zref ) +data_gemm_asm_blis_zref( :, : ) = data_gemm_asm_blis( :, : ); +data_gemm_openblas_zref( :, : ) = data_gemm_openblas( :, : ); + +fig = figure; +orient( fig, 'landscape' ); +set(gcf,'Position',[0 0 2000 900]); +set(gcf,'PaperUnits', 'inches'); +set(gcf,'PaperSize', [64 33]); +set(gcf,'PaperPosition', [0 0 64 33]); +%set(gcf,'PaperPositionMode','auto'); +set(gcf,'PaperPositionMode','manual'); +set(gcf,'PaperOrientation','landscape'); + +for dti = 1:n_combos +%for dti = 1:1 + + % Grab the current datatype combination. 
+ combo = dt_combos( dti, : ); + + str = sprintf( 'Plotting %d: %s', dti, combo ); disp(str); + + if combo(4) == 's' + data_gemm_asm_blis_ref( :, : ) = data_gemm_asm_blis_sref( :, : ); + data_gemm_openblas_ref( :, : ) = data_gemm_openblas_sref( :, : ); + elseif combo(4) == 'd' + data_gemm_asm_blis_ref( :, : ) = data_gemm_asm_blis_dref( :, : ); + data_gemm_openblas_ref( :, : ) = data_gemm_openblas_dref( :, : ); + end + + if ( combo(1) == 'c' || combo(1) == 'z' ) && ... + ( combo(2) == 'c' || combo(2) == 'z' ) && ... + ( combo(3) == 'c' || combo(3) == 'z' ) + if combo(4) == 's' + data_gemm_asm_blis_ref( :, : ) = data_gemm_asm_blis_cref( :, : ); + data_gemm_openblas_ref( :, : ) = data_gemm_openblas_cref( :, : ); + elseif combo(4) == 'd' + data_gemm_asm_blis_ref( :, : ) = data_gemm_asm_blis_zref( :, : ); + data_gemm_openblas_ref( :, : ) = data_gemm_openblas_zref( :, : ); + end + end + + % Construct filenames for the data files from templates. + file_blis = sprintf( filetemp_blis, thr_str, combo ); + file_open = sprintf( filetemp_open, thr_str, combo ); + + % Load the data files. + %str = sprintf( ' Loading %s', file_blis ); disp(str); + run( file_blis ) + %str = sprintf( ' Loading %s', file_open ); disp(str); + run( file_open ) + + % Plot the result. + plot_gemm_perf( combo, ... + data_gemm_asm_blis, ... + data_gemm_asm_blis_ref, ... + data_gemm_openblas, ... + data_gemm_openblas_ref, ... 
+ is_mt, dti ); + +end + + +if 0 +set(gcf,'Position',[0 0 2000 900]); +set(gcf,'PaperUnits', 'inches'); +set(gcf,'PaperSize', [48 22]); +set(gcf,'PaperPosition', [0 0 48 22]); +%set(gcf,'PaperPositionMode','auto'); +set(gcf,'PaperPositionMode','manual'); +set(gcf,'PaperOrientation','landscape'); +end +print(gcf, 'gemm_md','-bestfit','-dpdf'); +%print(gcf, 'gemm_md','-fillpage','-dpdf'); diff --git a/test/mixeddt/matlab/plot_gemm_perf.m b/test/mixeddt/matlab/plot_gemm_perf.m new file mode 100644 index 000000000..7fc9b0752 --- /dev/null +++ b/test/mixeddt/matlab/plot_gemm_perf.m @@ -0,0 +1,181 @@ +function r_val = plot_gemm_perf( dt_str, ... + data_blis, ... + data_blis_ref, ... + data_open, ... + data_open_ref, ... + is_mt, ... + theid ) + +if 1 +ax1 = subplot( 8, 16, theid ); +hold( ax1, 'on' ); +end + +color_blis_ref = 'b'; lines_blis_ref = ':'; markr_blis_ref = ''; +color_open_ref = 'k'; lines_open_ref = ':'; markr_open_ref = 'o'; +color_mkl_ref = 'r'; lines_mkl_ref = ':'; markr_mkl_ref = '.'; + +color_blis = 'b'; lines_blis = '-'; markr_blis = ''; +color_open = 'k'; lines_open = '-'; markr_open = 'o'; +color_mkl = 'r'; lines_mkl = '-'; markr_mkl = '.'; + +if dt_str(4) == 's' + flopspercycle = 32; +else + flopspercycle = 16; +end + +if is_mt == 1 + titlename = '%sgemm'; + yaxisname = 'GFLOPS/core'; + filename_pdf = 'fig_%sgemm_m1p_k1p_n1p_has_mt_perf.pdf'; + filename_png = 'fig_%sgemm_m1p_k1p_n1p_has_mt_perf.png'; + nth = 4; + x_end = 4000; + max_perf_core = (flopspercycle * 3.6) * 1; +else + titlename = '%sgemm'; + yaxisname = 'GFLOPS'; + filename_pdf = 'fig_%sgemm_m1p_k1p_n1p_has_st_perf.pdf'; + filename_png = 'fig_%sgemm_m1p_k1p_n1p_has_st_perf.png'; + nth = 1; + x_end = 2000; + max_perf_core = (flopspercycle * 3.6) * 1; +end + +titlename = sprintf( titlename, dt_str ); +filename_pdf = sprintf( filename_pdf, dt_str ); +filename_png = sprintf( filename_png, dt_str ); + +%dt0_str = [ dt_str(4), dt_str(4), dt_str(4), dt_str(4) ]; +dt0_str = dt_str(4); + 
+blis_sref_legend = sprintf( 'BLIS [sc]gemm' ); +blis_dref_legend = sprintf( 'BLIS [dz]gemm' ); +blis_legend = sprintf( 'BLIS mixed' ); +open_sref_legend = sprintf( 'OBLA [sc]gemm' ); +open_dref_legend = sprintf( 'OBLA [dz]gemm' ); +open_legend = sprintf( 'OBLA mixed' ); + +y_scale = 1.00; + +%xaxisname = 'problem size (m = n = k)'; +xaxisname = ' m = n = k'; + +colorflag = '-rgb'; + +x_begin = 0; + +y_begin = 0; +y_end = max_perf_core * y_scale; + +flopscol = 4; +msize = 5; +if 1 +fontsize = 12; +else +fontsize = 16; +end +linesize = 0.7; +legend_loc = 'SouthEast'; + +% -------------------------------------------------------------------- + +%fig = figure; +%hold on; ax1 = gca; + +x_axis( :, 1 ) = data_blis( :, 1 ); + +data_peak( 1, 1:2 ) = [ 0 max_perf_core ]; +data_peak( 2, 1:2 ) = [ x_end max_perf_core ]; + +blis_ref = line( x_axis( :, 1 ), data_blis_ref( :, flopscol ) / nth, ... + 'Color',color_blis_ref, 'LineStyle',lines_blis_ref, ... + 'LineWidth',linesize ); +blis_md = line( x_axis( :, 1 ), data_blis( :, flopscol ) / nth, ... + 'Color',color_blis, 'LineStyle',lines_blis, ... + 'LineWidth',linesize ); +open_ref = line( x_axis( :, 1 ), data_open_ref( :, flopscol ) / nth, ... + 'Color',color_open_ref, 'LineStyle',lines_open_ref, ... + 'LineWidth',linesize ); +open_md = line( x_axis( :, 1 ), data_open( :, flopscol ) / nth, ... + 'Color',color_open, 'LineStyle',lines_open, ... + 'LineWidth',linesize ); +%hold on; ax1 = gca; + %'Parent',ax1, ... + + +xlim( ax1, [x_begin x_end] ); +ylim( ax1, [y_begin y_end] ); + +if theid == 1 +leg = legend( ... +[ ... + blis_ref ... + blis_md ... + open_ref ... + open_md ... +], ... +blis_sref_legend, ... +blis_legend, ... +open_sref_legend, ... +open_legend, ... +'Location', 'best' ); +%'Location', legend_loc ); +set( leg,'Box','off' ); +set( leg,'Color','none' ); +set( leg,'FontSize',fontsize-2 ); +set( leg,'Units','inches' ); +elseif theid == 9 +leg = legend( ... +[ ... + blis_ref ... + blis_md ... + open_ref ... + open_md ... 
+], ... +blis_dref_legend, ... +blis_legend, ... +open_dref_legend, ... +open_legend, ... +'Location', 'best' ); +%'Location', legend_loc ); +set( leg,'Box','off' ); +set( leg,'Color','none' ); +set( leg,'FontSize',fontsize-2 ); +set( leg,'Units','inches' ); + +end + +set( ax1,'FontSize',fontsize ); +set( ax1,'TitleFontSizeMultiplier',1.0 ); % default is 1.1. +box( ax1, 'on' ); + +titl = title( titlename ); +set( titl, 'FontWeight', 'normal' ); % default font style is now 'bold'. + +tpos = get( titl, 'Position' ); % default is to align across whole figure, not box. +%tpos(1) = tpos(1) + 100; +tpos(1) = tpos(1) + 40; +set( titl, 'Position', tpos ); % here we nudge it back to centered with box. + +if theid > 112 +xlab = xlabel( ax1,xaxisname ); +%tpos = get( xlab, 'Position' ) +%tpos(2) = tpos(2) + 10; +%set( xlab, 'Position', tpos ); +end + +if mod(theid-1,16) == 0 +ylab = ylabel( ax1,yaxisname ); +end + + +%export_fig( filename, colorflag, '-pdf', '-m2', '-painters', '-transparent' ); +%saveas( fig, filename_png ); + +%hold( ax1, 'off' ); + +r_val = 0; + +end diff --git a/test/mixeddt/matlab/testrand.m b/test/mixeddt/matlab/testrand.m new file mode 100644 index 000000000..07474711f --- /dev/null +++ b/test/mixeddt/matlab/testrand.m @@ -0,0 +1,44 @@ +fig1 = figure(1); +clf; + +%orient(fig1,'landscape') +orient(gcf,'landscape') + +for i = 1:128 + subplot(8,16,i); + xx = 400:400:2000; + aa = rand(size(xx)); + plot(xx,aa); +end + +% broken. +if 0 +set(gcf, 'PaperUnits', 'inches'); +set(gcf, 'PaperSize', [60 36]); +set(fig1,'PaperUnits','normalized'); +set(fig1,'PaperPosition', [0 0 1 1]); +print(fig1, 'testrand', '-dpdf'); +end + +if 0 +% works okay. +set(gcf,'PaperUnits', 'inches'); +set(gcf,'PaperSize', [72 36]); +set(gcf,'PaperPositionMode','auto'); +set(gcf,'PaperOrientation','landscape'); +set(gcf,'Position',[50 50 4000 1800]); +print(gcf, 'testrand','-bestfit','-dpdf'); +end + +if 1 +% works better? 
+set(gcf,'Position',[0 0 2000 900]); +set(gcf,'PaperUnits', 'inches'); +set(gcf,'PaperSize', [48 22]); +set(gcf,'PaperPosition', [0 0 48 22]); +%set(gcf,'PaperPositionMode','auto'); +set(gcf,'PaperPositionMode','manual'); +set(gcf,'PaperOrientation','landscape'); +print(gcf, 'testrand','-bestfit','-dpdf'); +end + diff --git a/test/mixeddt/matlab/testrand.pdf b/test/mixeddt/matlab/testrand.pdf new file mode 100644 index 000000000..b97c17528 Binary files /dev/null and b/test/mixeddt/matlab/testrand.pdf differ diff --git a/test/mixeddt/runme.sh b/test/mixeddt/runme.sh new file mode 100755 index 000000000..2e9967f2b --- /dev/null +++ b/test/mixeddt/runme.sh @@ -0,0 +1,168 @@ +#!/bin/bash + +# File prefixes. +exec_root="test" +out_root="output" + +sys="blis" +#sys="stampede2" +#sys="lonestar5" + +# Bind threads to processors. +#export OMP_PROC_BIND=true +#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15" +#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7" +#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7" +#export GOMP_CPU_AFFINITY="0 2 4 6 1 3 5 7" +#export GOMP_CPU_AFFINITY="0 4 1 5 2 6 3 7" +#export GOMP_CPU_AFFINITY="0 1 4 5 8 9 12 13 16 17 20 21 24 25 28 29 32 33 36 37 40 41 44 45" +#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23" +#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23" +export GOMP_CPU_AFFINITY="0 1 2 3" + +# Modify LD_LIBRARY_PATH. +if [ ${sys} = "blis" ]; then + + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH" + +elif [ ${sys} = "stampede2" ]; then + + : + +elif [ ${sys} = "lonestar5" ]; then + + # A hack to use libiomp5 with gcc. 
+ #export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64" + : + +fi + +# Threading scheme to use when multithreading +if [ ${sys} = "blis" ]; then + + jc_nt=2 # 5th loop + ic_nt=2 # 3rd loop + jr_nt=1 # 2nd loop + ir_nt=1 # 1st loop + nt=4 + +elif [ ${sys} = "stampede2" ]; then + + jc_nt=2 # 5th loop + ic_nt=8 # 3rd loop + jr_nt=1 # 2nd loop + ir_nt=1 # 1st loop + nt=16 + +elif [ ${sys} = "lonestar5" ]; then + + jc_nt=4 # 5th loop + ic_nt=6 # 3rd loop + jr_nt=1 # 2nd loop + ir_nt=1 # 1st loop + nt=24 + +fi + +# Complex domain implementations to test. +if [ ${sys} = "blis" ]; then + + test_impls="openblas asm_blis" + +elif [ ${sys} = "stampede2" ]; then + + test_impls="openblas asm_blis mkl" + +elif [ ${sys} = "lonestar5" ]; then + + test_impls="openblas mkl asm_blis" +fi + +# Datatypes to test. +#dts="s d c z" + +# Operations to test. +l3_ops="gemm" +test_ops="${l3_ops}" + +# Define the list of datatype chars and precision chars. +dt_chars="s d c z" +pr_chars="s d" + +# Construct the datatype combination strings. +dt_combos="" +for dtc in ${dt_chars}; do + for dta in ${dt_chars}; do + for dtb in ${dt_chars}; do + for pre in ${pr_chars}; do + dt_combos="${dt_combos} ${dtc}${dta}${dtb}${pre}" + done + done + done +done + +# Threadedness to test. +threads="mt" +#threads="st" + +test_impls="openblas" + +#dt_combos="ssss sssd ssds sdss dsss ddds dddd" +#dt_combos="csss csds cdss cdds zsss zsds zdss zdds cssd csdd cdsd cddd zssd zsdd zdsd zddd" +#dt_combos="cssd csdd cdsd cddd zsss zsds zdss zdds" +#dt_combos="cdsd cddd zsss zsds zdss zdds" +#test_impls="asm_blis" + +# Now perform complex test cases. +for th in ${threads}; do + + for dt in ${dt_combos}; do + + for im in ${test_impls}; do + + for op in ${test_ops}; do + + # Set the number of threads according to th. 
+ if [ ${th} = "mt" ]; then + + export BLIS_JC_NT=${jc_nt} + export BLIS_IC_NT=${ic_nt} + export BLIS_JR_NT=${jr_nt} + export BLIS_IR_NT=${ir_nt} + export OMP_NUM_THREADS=${nt} + export OPENBLAS_NUM_THREADS=${nt} + + # Unset GOMP_CPU_AFFINITY for OpenBLAS, as it causes the library + # to execute sequentially. + if [ ${im} = "openblas" ]; then + unset GOMP_CPU_AFFINITY + else + export GOMP_CPU_AFFINITY="0 1 2 3" + fi + else + + export BLIS_JC_NT=1 + export BLIS_IC_NT=1 + export BLIS_JR_NT=1 + export BLIS_IR_NT=1 + export OMP_NUM_THREADS=1 + export OPENBLAS_NUM_THREADS=1 + fi + + # Construct the name of the test executable. + exec_name="${exec_root}_${dt}${op}_${im}_${th}.x" + + # Construct the name of the output file. + out_file="${out_root}_${th}_${dt}${op}_${im}.m" + + echo "Running (nt = ${OMP_NUM_THREADS}) ./${exec_name} > ${out_file}" + + # Run executable. + ./${exec_name} > ${out_file} + + #sleep 1 + + done + done + done +done diff --git a/test/mixeddt/test_gemm.c b/test/mixeddt/test_gemm.c new file mode 100644 index 000000000..7be31960d --- /dev/null +++ b/test/mixeddt/test_gemm.c @@ -0,0 +1,580 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include <unistd.h> +#include "blis.h" + +void blas_gemm_md( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c ); +void blas_gemm( trans_t transa, trans_t transb, num_t dt, obj_t* ao, obj_t* alpha, obj_t* bo, obj_t* beta, obj_t* co ); + +//#define PRINT + +int main( int argc, char** argv ) +{ + obj_t a, b, c; + obj_t c_save; + obj_t* alphao; + obj_t* betao; + dim_t m, n, k; + dim_t p; + dim_t p_begin, p_end, p_inc; + int m_input, n_input, k_input; + num_t dta, dtb, dtc, dtx; + char dta_ch, dtb_ch, dtc_ch; + char dtx_ch; + int r, n_repeats; + trans_t transa; + trans_t transb; + + double dtime; + double dtime_save; + double gflops; + double flopsmul; + + //bli_init(); + + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); + + n_repeats = 3; + + dta = DTA; + dtb = DTB; + dtc = DTC; + dtx = DTX; + + // Extract the precision component of the computation datatype. + prec_t comp_prec = bli_dt_prec( dtx ); + + ( void )dta_ch; + ( void )dtb_ch; + ( void )dtc_ch; + ( void )dtx_ch; + + p_begin = P_BEGIN; + p_end = P_END; + p_inc = P_INC; + + m_input = -1; + n_input = -1; + k_input = -1; + + +#if 0 + k_input = 256; +#endif + + // Choose the char corresponding to the requested datatype. 
+ if ( bli_is_float( dta ) ) dta_ch = 's'; + else if ( bli_is_double( dta ) ) dta_ch = 'd'; + else if ( bli_is_scomplex( dta ) ) dta_ch = 'c'; + else dta_ch = 'z'; + + if ( bli_is_float( dtb ) ) dtb_ch = 's'; + else if ( bli_is_double( dtb ) ) dtb_ch = 'd'; + else if ( bli_is_scomplex( dtb ) ) dtb_ch = 'c'; + else dtb_ch = 'z'; + + if ( bli_is_float( dtc ) ) dtc_ch = 's'; + else if ( bli_is_double( dtc ) ) dtc_ch = 'd'; + else if ( bli_is_scomplex( dtc ) ) dtc_ch = 'c'; + else dtc_ch = 'z'; + + if ( bli_is_float( dtx ) ) dtx_ch = 's'; + else dtx_ch = 'd'; + + transa = BLIS_NO_TRANSPOSE; + transb = BLIS_NO_TRANSPOSE; + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front. + for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; + + //printf( "data_%s_%c%c%c%cgemm_%s", THR_STR, dtc_ch, dta_ch, dtb_ch, dtx_ch, STR ); + printf( "data_gemm_%s", STR ); + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )0, + ( unsigned long )0, + ( unsigned long )0, 0.0 ); + + // Adjust the flops scaling based on which domain case is being executed. 
+ if ( bli_is_real( dtc ) && bli_is_real( dta ) && bli_is_real( dtb ) ) + flopsmul = 2.0; + else if ( bli_is_real( dtc ) && bli_is_real( dta ) && bli_is_complex( dtb ) ) + flopsmul = 2.0; + else if ( bli_is_real( dtc ) && bli_is_complex( dta ) && bli_is_real( dtb ) ) + flopsmul = 2.0; + else if ( bli_is_real( dtc ) && bli_is_complex( dta ) && bli_is_complex( dtb ) ) +#ifdef BLIS + flopsmul = 4.0; +#else + flopsmul = 4.0; // executes 8.0, but only gets "credit" for 4.0 +#endif + else if ( bli_is_complex( dtc ) && bli_is_real( dta ) && bli_is_real( dtb ) ) + flopsmul = 2.0; + else if ( bli_is_complex( dtc ) && bli_is_real( dta ) && bli_is_complex( dtb ) ) +#ifdef BLIS + flopsmul = 4.0; +#else + flopsmul = 4.0; // executes 8.0, but only gets "credit" for 4.0 +#endif + else if ( bli_is_complex( dtc ) && bli_is_complex( dta ) && bli_is_real( dtb ) ) + flopsmul = 4.0; + else if ( bli_is_complex( dtc ) && bli_is_complex( dta ) && bli_is_complex( dtb ) ) + flopsmul = 8.0; + + + for ( p = p_begin; p <= p_end; p += p_inc ) + { + + if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); + else m = ( dim_t ) m_input; + if ( n_input < 0 ) n = p * ( dim_t )abs(n_input); + else n = ( dim_t ) n_input; + if ( k_input < 0 ) k = p * ( dim_t )abs(k_input); + else k = ( dim_t ) k_input; + + bli_obj_create( dta, m, k, 0, 0, &a ); + bli_obj_create( dtb, k, n, 0, 0, &b ); + bli_obj_create( dtc, m, n, 0, 0, &c ); + bli_obj_create( dtc, m, n, 0, 0, &c_save ); + + bli_obj_set_comp_prec( comp_prec, &c ); + + alphao = &BLIS_ONE; + betao = &BLIS_ONE; + + bli_randm( &a ); + bli_randm( &b ); + bli_randm( &c ); + + bli_obj_set_conjtrans( transa, &a ); + bli_obj_set_conjtrans( transb, &b ); + + bli_copym( &c, &c_save ); + + dtime_save = DBL_MAX; + + for ( r = 0; r < n_repeats; ++r ) + { + bli_copym( &c_save, &c ); + + dtime = bli_clock(); + +#ifdef PRINT + bli_printm( "a", &a, "%4.1f", "" ); + bli_printm( "b", &b, "%4.1f", "" ); + bli_printm( "c", &c, "%4.1f", "" ); +#endif + +#ifdef BLIS + + bli_gemm + 
( + alphao, + &a, + &b, + betao, + &c + ); + +#else + blas_gemm_md + ( + alphao, + &a, + &b, + betao, + &c + ); +#endif + +#ifdef PRINT + bli_printm( "c after", &c, "%4.1f", "" ); + exit(1); +#endif + + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + gflops = ( flopsmul * m * k * n ) / ( dtime_save * 1.0e9 ); + + //printf( "data_%s_%c%c%c%cgemm_%s", THR_STR, dtc_ch, dta_ch, dtb_ch, dtx_ch, STR ); + printf( "data_gemm_%s", STR ); + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )m, + ( unsigned long )k, + ( unsigned long )n, gflops ); + + bli_obj_free( &a ); + bli_obj_free( &b ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); + } + + //bli_finalize(); + + return 0; +} + +void blas_gemm_md( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c ) +{ + trans_t transa = bli_obj_conjtrans_status( a ); + trans_t transb = bli_obj_conjtrans_status( b ); + + prec_t comp_prec = bli_obj_comp_prec( c ); + + if ( bli_obj_dt( a ) == bli_obj_dt( b ) && + bli_obj_dt( b ) == bli_obj_dt( c ) && + bli_obj_dt( c ) == ( num_t )comp_prec ) + { + blas_gemm( transa, transb, bli_obj_dt( c ), alpha, a, b, beta, c ); + return; + } + + num_t dtc = bli_obj_dt( c ); + num_t dta = bli_obj_dt( a ); + num_t dtb = bli_obj_dt( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width_after_trans( a ); + + obj_t* ao = a; + obj_t* bo = b; + obj_t* co = c; + + num_t targ_dt_c, targ_dt_a, targ_dt_b; + dom_t targ_dom_c, targ_dom_a, targ_dom_b; + num_t dt_comp; + dom_t comp_dom; + obj_t at, bt, ct; + obj_t ar, cr; + bool_t needacc; + bool_t force_proj_a = FALSE; + bool_t force_proj_b = FALSE; + + + + if ( bli_is_real( dtc ) && bli_is_real( dta ) && bli_is_real( dtb ) ) + { + // rrr + comp_dom = BLIS_REAL; + targ_dom_c = BLIS_REAL; targ_dom_a = BLIS_REAL; targ_dom_b = BLIS_REAL; + needacc = FALSE; + } + else if ( bli_is_real( dtc ) && bli_is_real( dta ) && bli_is_complex( dtb 
) ) + { + // rrc + comp_dom = BLIS_REAL; + targ_dom_c = BLIS_REAL; targ_dom_a = BLIS_REAL; targ_dom_b = BLIS_REAL; + needacc = FALSE; + force_proj_b = TRUE; + } + else if ( bli_is_real( dtc ) && bli_is_complex( dta ) && bli_is_real( dtb ) ) + { + // rcr + comp_dom = BLIS_REAL; + targ_dom_c = BLIS_REAL; targ_dom_a = BLIS_REAL; targ_dom_b = BLIS_REAL; + needacc = FALSE; + force_proj_a = TRUE; + } + else if ( bli_is_real( dtc ) && bli_is_complex( dta ) && bli_is_complex( dtb ) ) + { + // rcc + comp_dom = BLIS_COMPLEX; + targ_dom_c = BLIS_COMPLEX; targ_dom_a = BLIS_COMPLEX; targ_dom_b = BLIS_COMPLEX; + needacc = TRUE; + } + else if ( bli_is_complex( dtc ) && bli_is_real( dta ) && bli_is_real( dtb ) ) + { + // crr + comp_dom = BLIS_REAL; + targ_dom_c = BLIS_REAL; targ_dom_a = BLIS_REAL; targ_dom_b = BLIS_REAL; + needacc = TRUE; + } + else if ( bli_is_complex( dtc ) && bli_is_real( dta ) && bli_is_complex( dtb ) ) + { + // crc + comp_dom = BLIS_COMPLEX; + targ_dom_c = BLIS_COMPLEX; targ_dom_a = BLIS_COMPLEX; targ_dom_b = BLIS_COMPLEX; + needacc = FALSE; + force_proj_a = TRUE; + } + else if ( bli_is_complex( dtc ) && bli_is_complex( dta ) && bli_is_real( dtb ) ) + { + // ccr + comp_dom = BLIS_REAL; + targ_dom_c = BLIS_COMPLEX; targ_dom_a = BLIS_COMPLEX; targ_dom_b = BLIS_REAL; + needacc = FALSE; + } + else if ( bli_is_complex( dtc ) && bli_is_complex( dta ) && bli_is_complex( dtb ) ) + { + // ccc + comp_dom = BLIS_COMPLEX; + targ_dom_c = BLIS_COMPLEX; targ_dom_a = BLIS_COMPLEX; targ_dom_b = BLIS_COMPLEX; + needacc = FALSE; + } + else + { + comp_dom = BLIS_REAL; + targ_dom_c = BLIS_REAL; targ_dom_a = BLIS_REAL; targ_dom_b = BLIS_REAL; + needacc = FALSE; + } + + // ---------------------------------------------------------------------------- + + + // Merge the computation domain with the computation precision. 
+ dt_comp = comp_dom | comp_prec; + + targ_dt_a = targ_dom_a | comp_prec; + targ_dt_b = targ_dom_b | comp_prec; + targ_dt_c = targ_dom_c | comp_prec; + + // Copy-cast A, if needed. + if ( bli_dt_prec( dta ) != comp_prec || force_proj_a ) + { + bli_obj_create( targ_dt_a, m, k, 0, 0, &at ); + bli_castm( ao, &at ); + ao = &at; + } + + // Copy-cast B, if needed. + if ( bli_dt_prec( dtb ) != comp_prec || force_proj_b ) + { + bli_obj_create( targ_dt_b, k, n, 0, 0, &bt ); + bli_castm( bo, &bt ); + bo = &bt; + } + + if ( bli_dt_prec( dtc ) != comp_prec ) + { + needacc = TRUE; + } + + // Copy-cast C, if needed. + if ( needacc ) + { + //bli_obj_create( dt_comp, m, n, 0, 0, &ct ); + bli_obj_create( targ_dt_c, m, n, 0, 0, &ct ); + bli_castm( c, &ct ); + co = &ct; + } + + // ---------------------------------------------------------------------------- + + if ( bli_is_real( dtc ) && bli_is_real( dta ) && bli_is_real( dtb ) ) + { + } + else if ( bli_is_real( dtc ) && bli_is_real( dta ) && bli_is_complex( dtb ) ) + { + } + else if ( bli_is_real( dtc ) && bli_is_complex( dta ) && bli_is_real( dtb ) ) + { + } + else if ( bli_is_real( dtc ) && bli_is_complex( dta ) && bli_is_complex( dtb ) ) + { + } + else if ( bli_is_complex( dtc ) && bli_is_real( dta ) && bli_is_real( dtb ) ) + { + } + else if ( bli_is_complex( dtc ) && bli_is_real( dta ) && bli_is_complex( dtb ) ) + { + } + else if ( bli_is_complex( dtc ) && bli_is_complex( dta ) && bli_is_real( dtb ) ) + { + inc_t rsa = bli_obj_row_stride( ao ); + inc_t csa = bli_obj_col_stride( ao ); + inc_t ma = bli_obj_length( ao ); + inc_t na = bli_obj_width( ao ); + siz_t ela = bli_obj_elem_size( ao ); + num_t dtap = bli_obj_dt_proj_to_real( ao ); + + bli_obj_alias_to( ao, &ar ); ao = &ar; + bli_obj_set_strides( rsa, 2*csa, ao ); + bli_obj_set_dims( 2*ma, na, ao ); + bli_obj_set_dt( dtap, ao ); + bli_obj_set_elem_size( ela/2, ao ); + + inc_t rsc = bli_obj_row_stride( co ); + inc_t csc = bli_obj_col_stride( co ); + inc_t mc = bli_obj_length( 
co ); + inc_t nc = bli_obj_width( co ); + siz_t elc = bli_obj_elem_size( co ); + num_t dtcp = bli_obj_dt_proj_to_real( co ); + + bli_obj_alias_to( co, &cr ); co = &cr; + bli_obj_set_strides( rsc, 2*csc, co ); + bli_obj_set_dims( 2*mc, nc, co ); + bli_obj_set_dt( dtcp, co ); + bli_obj_set_elem_size( elc/2, co ); + } + else if ( bli_is_complex( dtc ) && bli_is_complex( dta ) && bli_is_complex( dtb ) ) + { + } + else + { + } + + // ---------------------------------------------------------------------------- + + + // Call the BLAS. + blas_gemm( transa, transb, dt_comp, alpha, ao, bo, beta, co ); + + // Accumulate back to C, if needed. + if ( needacc ) + { + bli_castm( &ct, c ); + } + + + if ( bli_dt_prec( dta ) != comp_prec || force_proj_a ) { bli_obj_free( &at ); } + if ( bli_dt_prec( dtb ) != comp_prec || force_proj_b ) { bli_obj_free( &bt ); } + if ( needacc ) { bli_obj_free( &ct ); } +} + +void blas_gemm( trans_t transa, trans_t transb, num_t dt, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c ) +{ + char f77_transa = 'N'; + char f77_transb = 'N'; + + //bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); + //bli_param_map_blis_to_netlib_trans( transb, &f77_transb ); + + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( c ); + f77_int kk = bli_obj_width_after_trans( a ); + f77_int nn = bli_obj_width( c ); + f77_int lda = bli_obj_col_stride( a ); + f77_int ldb = bli_obj_col_stride( b ); + f77_int ldc = bli_obj_col_stride( c ); + float* alphap = bli_obj_buffer_for_1x1( dt, alpha ); + float* ap = bli_obj_buffer( a ); + float* bp = bli_obj_buffer( b ); + float* betap = bli_obj_buffer_for_1x1( dt, beta ); + float* cp = bli_obj_buffer( c ); + + sgemm_( &f77_transa, + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length( c ); + f77_int kk = bli_obj_width_after_trans( a ); + f77_int nn = bli_obj_width( c ); + f77_int lda = 
bli_obj_col_stride( a ); + f77_int ldb = bli_obj_col_stride( b ); + f77_int ldc = bli_obj_col_stride( c ); + double* alphap = bli_obj_buffer_for_1x1( dt, alpha ); + double* ap = bli_obj_buffer( a ); + double* bp = bli_obj_buffer( b ); + double* betap = bli_obj_buffer_for_1x1( dt, beta ); + double* cp = bli_obj_buffer( c ); + + dgemm_( &f77_transa, + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( c ); + f77_int kk = bli_obj_width_after_trans( a ); + f77_int nn = bli_obj_width( c ); + f77_int lda = bli_obj_col_stride( a ); + f77_int ldb = bli_obj_col_stride( b ); + f77_int ldc = bli_obj_col_stride( c ); + scomplex* alphap = bli_obj_buffer_for_1x1( dt, alpha ); + scomplex* ap = bli_obj_buffer( a ); + scomplex* bp = bli_obj_buffer( b ); + scomplex* betap = bli_obj_buffer_for_1x1( dt, beta ); + scomplex* cp = bli_obj_buffer( c ); + + cgemm_( &f77_transa, + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( c ); + f77_int kk = bli_obj_width_after_trans( a ); + f77_int nn = bli_obj_width( c ); + f77_int lda = bli_obj_col_stride( a ); + f77_int ldb = bli_obj_col_stride( b ); + f77_int ldc = bli_obj_col_stride( c ); + dcomplex* alphap = bli_obj_buffer_for_1x1( dt, alpha ); + dcomplex* ap = bli_obj_buffer( a ); + dcomplex* bp = bli_obj_buffer( b ); + dcomplex* betap = bli_obj_buffer_for_1x1( dt, beta ); + dcomplex* cp = bli_obj_buffer( c ); + + zgemm_( &f77_transa, + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } +} + diff --git a/testsuite/input.general b/testsuite/input.general index 601941d83..772840224 100644 --- a/testsuite/input.general +++ b/testsuite/input.general @@ -25,6 +25,8 @@ cj # Vector storage scheme(s) to test: sdcz # Datatype(s) to test: # 's' = single real; 'c' = single complex; 
# 'd' = double real; 'z' = double complex +0 # Test gemm with mixed-domain operands? +0 # Test gemm with mixed-precision operands? 100 # Problem size: first to test 500 # Problem size: maximum to test 100 # Problem size: increment between experiments diff --git a/testsuite/input.general.fast b/testsuite/input.general.fast index f9de7d099..02b30b897 100644 --- a/testsuite/input.general.fast +++ b/testsuite/input.general.fast @@ -25,6 +25,8 @@ cj # Vector storage scheme(s) to test: sdcz # Datatype(s) to test: # 's' = single real; 'c' = single complex; # 'd' = double real; 'z' = double complex +0 # Test gemm with mixed-domain operands? +0 # Test gemm with mixed-precision operands? 100 # Problem size: first to test 100 # Problem size: maximum to test 100 # Problem size: increment between experiments diff --git a/testsuite/input.operations b/testsuite/input.operations index c3e6d6f16..f35e2cd9b 100644 --- a/testsuite/input.operations +++ b/testsuite/input.operations @@ -190,6 +190,10 @@ -1 -2 # dimensions: m n ? # parameters: transa +1 # xpbym +-1 -1 # dimensions: m n +? 
# parameters: transa + # --- Level-1f kernels ----------------------------------------------------- diff --git a/testsuite/src/test_addm.c b/testsuite/src/test_addm.c index bc261d7ae..a6b37864e 100644 --- a/testsuite/src/test_addm.c +++ b/testsuite/src/test_addm.c @@ -59,7 +59,7 @@ void libblis_test_addm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -139,7 +139,7 @@ void libblis_test_addm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -150,6 +150,8 @@ void libblis_test_addm_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n; trans_t transx; @@ -158,6 +160,9 @@ void libblis_test_addm_experiment obj_t x, y; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_addv.c b/testsuite/src/test_addv.c index eeaa38dd4..bb9a6c60d 100644 --- a/testsuite/src/test_addv.c +++ b/testsuite/src/test_addv.c @@ -59,7 +59,7 @@ void libblis_test_addv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -138,7 +138,7 @@ void libblis_test_addv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -149,6 +149,8 @@ void libblis_test_addv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; conj_t conjx; @@ -157,6 +159,9 @@ void libblis_test_addv_experiment obj_t x, y; + // Use the datatype of the first char in the datatype combination string. 
+ bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_amaxv.c b/testsuite/src/test_amaxv.c index 0aa5aaeb5..aa2865fad 100644 --- a/testsuite/src/test_amaxv.c +++ b/testsuite/src/test_amaxv.c @@ -59,7 +59,7 @@ void libblis_test_amaxv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -142,7 +142,7 @@ void libblis_test_amaxv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -156,12 +156,17 @@ void libblis_test_amaxv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; obj_t x; obj_t index; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_axpbyv.c b/testsuite/src/test_axpbyv.c index 6a26d81d4..37c155eea 100644 --- a/testsuite/src/test_axpbyv.c +++ b/testsuite/src/test_axpbyv.c @@ -59,7 +59,7 @@ void libblis_test_axpbyv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -149,7 +149,7 @@ void libblis_test_axpbyv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -163,6 +163,8 @@ void libblis_test_axpbyv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; conj_t conjx; @@ -171,6 +173,9 @@ void libblis_test_axpbyv_experiment obj_t y_save; + // Use the datatype of the first char in the datatype combination string. 
+ bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_axpy2v.c b/testsuite/src/test_axpy2v.c index 5a8dc32ae..1e3a610e8 100644 --- a/testsuite/src/test_axpy2v.c +++ b/testsuite/src/test_axpy2v.c @@ -59,7 +59,7 @@ void libblis_test_axpy2v_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -149,7 +149,7 @@ void libblis_test_axpy2v_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -163,6 +163,8 @@ void libblis_test_axpy2v_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; conj_t conjx, conjy; @@ -172,9 +174,13 @@ void libblis_test_axpy2v_experiment cntx_t* cntx; + // Query a context. cntx = bli_gks_query_cntx(); + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_axpyf.c b/testsuite/src/test_axpyf.c index e37f8d4f7..cac2760d3 100644 --- a/testsuite/src/test_axpyf.c +++ b/testsuite/src/test_axpyf.c @@ -59,7 +59,7 @@ void libblis_test_axpyf_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -147,7 +147,7 @@ void libblis_test_axpyf_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -161,6 +161,8 @@ void libblis_test_axpyf_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, b_n; conj_t conja, conjx; @@ -170,9 +172,13 @@ void libblis_test_axpyf_experiment cntx_t* cntx; + // Query a context. cntx = bli_gks_query_cntx(); + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_axpym.c b/testsuite/src/test_axpym.c index e8c15a8e9..240bd4251 100644 --- a/testsuite/src/test_axpym.c +++ b/testsuite/src/test_axpym.c @@ -59,7 +59,7 @@ void libblis_test_axpym_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -144,7 +144,7 @@ void libblis_test_axpym_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -158,6 +158,8 @@ void libblis_test_axpym_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n; trans_t transx; @@ -166,6 +168,9 @@ void libblis_test_axpym_experiment obj_t y_save; + // Use the datatype of the first char in the datatype combination string. 
+ bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_axpyv.c b/testsuite/src/test_axpyv.c index 1a048c7fa..60a1e3ece 100644 --- a/testsuite/src/test_axpyv.c +++ b/testsuite/src/test_axpyv.c @@ -59,7 +59,7 @@ void libblis_test_axpyv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -144,7 +144,7 @@ void libblis_test_axpyv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -158,6 +158,8 @@ void libblis_test_axpyv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; conj_t conjx; @@ -166,6 +168,9 @@ void libblis_test_axpyv_experiment obj_t y_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_copym.c b/testsuite/src/test_copym.c index 2ea6ad7e8..9deebe92a 100644 --- a/testsuite/src/test_copym.c +++ b/testsuite/src/test_copym.c @@ -59,7 +59,7 @@ void libblis_test_copym_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -138,7 +138,7 @@ void libblis_test_copym_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -149,6 +149,8 @@ void libblis_test_copym_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n; trans_t transx; @@ -156,6 +158,9 @@ void libblis_test_copym_experiment obj_t x, y; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_copyv.c b/testsuite/src/test_copyv.c index 62892a793..8b3c3b7d4 100644 --- a/testsuite/src/test_copyv.c +++ b/testsuite/src/test_copyv.c @@ -59,7 +59,7 @@ void libblis_test_copyv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -138,7 +138,7 @@ void libblis_test_copyv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -149,6 +149,8 @@ void libblis_test_copyv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; conj_t conjx; @@ -156,6 +158,9 @@ void libblis_test_copyv_experiment obj_t x, y; + // Use the datatype of the first char in the datatype combination string. 
+ bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_dotaxpyv.c b/testsuite/src/test_dotaxpyv.c index ca416f62f..7cb6000af 100644 --- a/testsuite/src/test_dotaxpyv.c +++ b/testsuite/src/test_dotaxpyv.c @@ -59,7 +59,7 @@ void libblis_test_dotaxpyv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -151,7 +151,7 @@ void libblis_test_dotaxpyv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -165,6 +165,8 @@ void libblis_test_dotaxpyv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; conj_t conjxt, conjx, conjy; @@ -175,9 +177,13 @@ void libblis_test_dotaxpyv_experiment cntx_t* cntx; + // Query a context. cntx = bli_gks_query_cntx(); + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_dotv.c b/testsuite/src/test_dotv.c index 3d4fe840c..6cf0d229a 100644 --- a/testsuite/src/test_dotv.c +++ b/testsuite/src/test_dotv.c @@ -59,7 +59,7 @@ void libblis_test_dotv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -140,7 +140,7 @@ void libblis_test_dotv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -154,6 +154,8 @@ void libblis_test_dotv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; conj_t conjx, conjy, conjconjxy; @@ -161,6 +163,9 @@ void libblis_test_dotv_experiment obj_t x, y, rho; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_dotxaxpyf.c b/testsuite/src/test_dotxaxpyf.c index 3eb0363bf..f59497f28 100644 --- a/testsuite/src/test_dotxaxpyf.c +++ b/testsuite/src/test_dotxaxpyf.c @@ -59,7 +59,7 @@ void libblis_test_dotxaxpyf_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -157,7 +157,7 @@ void libblis_test_dotxaxpyf_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -171,6 +171,8 @@ void libblis_test_dotxaxpyf_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, b_n; conj_t conjat, conja, conjw, conjx; @@ -180,9 +182,13 @@ void libblis_test_dotxaxpyf_experiment cntx_t* cntx; + // Query a context. 
cntx = bli_gks_query_cntx(); + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_dotxf.c b/testsuite/src/test_dotxf.c index db92b37d7..12f44c260 100644 --- a/testsuite/src/test_dotxf.c +++ b/testsuite/src/test_dotxf.c @@ -59,7 +59,7 @@ void libblis_test_dotxf_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -149,7 +149,7 @@ void libblis_test_dotxf_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -163,6 +163,8 @@ void libblis_test_dotxf_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, b_n; conj_t conjat, conjx; @@ -172,9 +174,13 @@ void libblis_test_dotxf_experiment cntx_t* cntx; + // Query a context. cntx = bli_gks_query_cntx(); + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_dotxv.c b/testsuite/src/test_dotxv.c index 5033c7504..774706414 100644 --- a/testsuite/src/test_dotxv.c +++ b/testsuite/src/test_dotxv.c @@ -59,7 +59,7 @@ void libblis_test_dotxv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -145,7 +145,7 @@ void libblis_test_dotxv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -159,6 +159,8 @@ void libblis_test_dotxv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; conj_t conjx, conjy, conjconjxy; @@ -167,6 +169,9 @@ void libblis_test_dotxv_experiment obj_t rho_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_gemm.c b/testsuite/src/test_gemm.c index f3869d814..71ab97449 100644 --- a/testsuite/src/test_gemm.c +++ b/testsuite/src/test_gemm.c @@ -59,7 +59,20 @@ void libblis_test_gemm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ); + +void libblis_test_gemm_md + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -89,6 +102,24 @@ void libblis_test_gemm_check double* resid ); +void libblis_test_gemm_md_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + obj_t* c_orig, + double* resid + ); + +double libblis_test_gemm_flops + ( + obj_t* a, + obj_t* b, + obj_t* c + ); void libblis_test_gemm_deps @@ -151,7 +182,7 @@ void libblis_test_gemm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -165,6 +196,8 @@ void libblis_test_gemm_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n, k; trans_t transa; @@ -174,6 +207,18 @@ void libblis_test_gemm_experiment obj_t c_save; + // Use a different function to handle mixed datatypes. + if ( params->mixed_domain || params->mixed_precision ) + { + libblis_test_gemm_md( params, op, iface, + dc_str, pc_str, sc_str, + p_cur, perf, resid ); + return; + } + + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); @@ -189,7 +234,7 @@ void libblis_test_gemm_experiment // Create test operands (vectors and/or matrices). libblis_test_mobj_create( params, datatype, transa, - sc_str[0], m, k, &a ); + sc_str[0], m, k, &a ); libblis_test_mobj_create( params, datatype, transb, sc_str[1], k, n, &b ); libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, @@ -249,6 +294,134 @@ void libblis_test_gemm_experiment } +void libblis_test_gemm_md + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ) +{ + unsigned int n_repeats = params->n_repeats; + unsigned int i; + + double time_min = DBL_MAX; + double time; + + num_t dt_a, dt_b, dt_c; + num_t dt_complex; + + dim_t m, n, k; + + trans_t transa; + trans_t transb; + + obj_t alpha, a, b, beta, c; + obj_t c_save; + + + // Decode the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &dt_c ); + bli_param_map_char_to_blis_dt( dc_str[1], &dt_a ); + bli_param_map_char_to_blis_dt( dc_str[2], &dt_b ); + + // Project one of the datatypes (it doesn't matter which) to the + // complex domain. + dt_complex = bli_dt_proj_to_complex( dt_c ); + + // Map the dimension specifier to actual dimensions. + m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); + n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); + k = libblis_test_get_dim_from_prob_size( op->dim_spec[2], p_cur ); + + // Map parameter characters to BLIS constants. + bli_param_map_char_to_blis_trans( pc_str[0], &transa ); + bli_param_map_char_to_blis_trans( pc_str[1], &transb ); + + // Create test scalars. + bli_obj_scalar_init_detached( dt_complex, &alpha ); + bli_obj_scalar_init_detached( dt_complex, &beta ); + + // Create test operands (vectors and/or matrices). 
+ libblis_test_mobj_create( params, dt_a, transa, + sc_str[0], m, k, &a ); + libblis_test_mobj_create( params, dt_b, transb, + sc_str[1], k, n, &b ); + libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE, + sc_str[2], m, n, &c ); + libblis_test_mobj_create( params, dt_c, BLIS_NO_TRANSPOSE, + sc_str[2], m, n, &c_save ); + + // For mixed-precision, set the computation precision of C. + if ( params->mixed_precision ) + { + num_t dt_comp; + prec_t comp_prec; + + // The computation precision is encoded in the computation datatype, + // which appears as an additional char in dc_str. + bli_param_map_char_to_blis_dt( dc_str[3], &dt_comp ); + + // Extract the precision from the computation datatype. + comp_prec = bli_dt_prec( dt_comp ); + + // Set the computation precision of C. + bli_obj_set_comp_prec( comp_prec, &c ); + } + + + // Set alpha and beta. + { + bli_setsc( 2.0, 0.0, &alpha ); + bli_setsc( 1.2, 0.5, &beta ); + } + + // Randomize A, B, and C, and save C. + libblis_test_mobj_randomize( params, TRUE, &a ); + libblis_test_mobj_randomize( params, TRUE, &b ); + libblis_test_mobj_randomize( params, TRUE, &c ); + bli_copym( &c, &c_save ); + + // Apply the parameters. + bli_obj_set_conjtrans( transa, &a ); + bli_obj_set_conjtrans( transb, &b ); + + // Repeat the experiment n_repeats times and record results. + for ( i = 0; i < n_repeats; ++i ) + { + bli_copym( &c_save, &c ); + + time = bli_clock(); + + libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c ); + + time_min = bli_clock_min_diff( time_min, time ); + } + + // Estimate the performance of the best experiment repeat. + //*perf = ( 2.0 * m * n * k ) / time_min / FLOPS_PER_UNIT_PERF; + //if ( bli_obj_is_complex( &c ) ) *perf *= 4.0; + *perf = libblis_test_gemm_flops( &a, &b, &c ) / time_min / FLOPS_PER_UNIT_PERF; + + // Perform checks. + libblis_test_gemm_md_check( params, &alpha, &a, &b, &beta, &c, &c_save, resid ); + + // Zero out performance and residual if output matrix is empty. 
+ libblis_test_check_empty_problem( &c, perf, resid ); + + // Free the test objects. + bli_obj_free( &a ); + bli_obj_free( &b ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); +} + + void libblis_test_gemm_impl ( @@ -273,6 +446,116 @@ void libblis_test_gemm_impl +void libblis_test_gemm_md_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + obj_t* c_orig, + double* resid + ) +{ + num_t dt_real = bli_obj_dt_proj_to_real( c ); + num_t dt_comp = bli_obj_dt_proj_to_complex( c ); + num_t dt; + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width_after_trans( a ); + + obj_t norm; + obj_t t, v, w, z; + + double junk; + + // Compute our reference checksum in the real domain if all operands + // are real, and in the complex domain otherwise. Also implicit in this + // is that we use the storage precision of C to determine the precision + // in which we perform the reference checksum. + if ( bli_obj_is_real( a ) && + bli_obj_is_real( b ) && + bli_obj_is_real( c ) ) dt = dt_real; + else dt = dt_comp; + + // This function works in a manner similar to that of the function + // libblis_test_gemm_check(), except that we project a, b, and c into + // the complex domain (regardless of their storage datatype), and then + // proceed with the checking accordingly. + + obj_t a2, b2, c2, c0; + + bli_obj_scalar_init_detached( dt_real, &norm ); + + bli_obj_create( dt, n, 1, 0, 0, &t ); + bli_obj_create( dt, m, 1, 0, 0, &v ); + bli_obj_create( dt, k, 1, 0, 0, &w ); + bli_obj_create( dt, m, 1, 0, 0, &z ); + + libblis_test_vobj_randomize( params, TRUE, &t ); + + // We need to zero out the imaginary part of t in order for our + // checks to work in all cases. Otherwise, the imaginary parts + // could affect intermediate products, depending on the order that + // they are executed. + bli_setiv( &BLIS_ZERO, &t ); + + // Create complex equivalents of a, b, c_orig, and c. 
+ bli_obj_create( dt, m, k, 0, 0, &a2 ); + bli_obj_create( dt, k, n, 0, 0, &b2 ); + bli_obj_create( dt, m, n, 0, 0, &c2 ); + bli_obj_create( dt, m, n, 0, 0, &c0 ); + + // Cast a, b, c_orig, and c into the datatype of our temporary objects. + bli_castm( a, &a2 ); + bli_castm( b, &b2 ); + bli_castm( c_orig, &c2 ); + bli_castm( c, &c0 ); + + bli_gemv( &BLIS_ONE, &c0, &t, &BLIS_ZERO, &v ); + +#if 0 +if ( bli_obj_is_scomplex( c ) && + bli_obj_is_float( a ) && + bli_obj_is_float( b ) ) +{ +bli_printm( "test_gemm.c: a", a, "%7.3f", "" ); +bli_printm( "test_gemm.c: b", b, "%7.3f", "" ); +bli_printm( "test_gemm.c: c orig", c_orig, "%7.3f", "" ); +bli_printm( "test_gemm.c: c computed", c, "%7.3f", "" ); +} +#endif + +#if 0 + bli_gemm( alpha, &a2, &b2, beta, &c2 ); + bli_gemv( &BLIS_ONE, &c2, &t, &BLIS_ZERO, &z ); + if ( bli_obj_is_real( c ) ) bli_setiv( &BLIS_ZERO, &z ); +#else + bli_gemv( &BLIS_ONE, &b2, &t, &BLIS_ZERO, &w ); + bli_gemv( alpha, &a2, &w, &BLIS_ZERO, &z ); + bli_gemv( beta, &c2, &t, &BLIS_ONE, &z ); + if ( bli_obj_is_real( c ) ) bli_setiv( &BLIS_ZERO, &z ); +#endif + + bli_subv( &z, &v ); + bli_normfv( &v, &norm ); + bli_getsc( &norm, resid, &junk ); + + bli_obj_free( &t ); + bli_obj_free( &v ); + bli_obj_free( &w ); + bli_obj_free( &z ); + + bli_obj_free( &a2 ); + bli_obj_free( &b2 ); + bli_obj_free( &c2 ); + bli_obj_free( &c0 ); +} + + + void libblis_test_gemm_check ( test_params_t* params, @@ -348,3 +631,44 @@ void libblis_test_gemm_check bli_obj_free( &z ); } +double libblis_test_gemm_flops + ( + obj_t* a, + obj_t* b, + obj_t* c + ) +{ + bool_t a_is_real = bli_obj_is_real( a ); + bool_t a_is_complex = bli_obj_is_complex( a ); + + bool_t b_is_real = bli_obj_is_real( b ); + bool_t b_is_complex = bli_obj_is_complex( b ); + + bool_t c_is_real = bli_obj_is_real( c ); + bool_t c_is_complex = bli_obj_is_complex( c ); + + double m = ( double )bli_obj_length( c ); + double n = ( double )bli_obj_width( c ); + // Use the post-transposition width of A so that k is the inner dimension even when A is marked as transposed. + double k = ( double )bli_obj_width_after_trans( a ); + + double 
flops; + + if ( ( c_is_complex && a_is_complex && b_is_complex ) ) + { + flops = 8.0 * m * n * k; + } + else if ( ( c_is_complex && a_is_complex && b_is_real ) || + ( c_is_complex && a_is_real && b_is_complex ) || + ( c_is_real && a_is_complex && b_is_complex ) ) + { + flops = 4.0 * m * n * k; + } + else + { + flops = 2.0 * m * n * k; + } + + return flops; +} + diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 4fa0a2f27..f8fcb1224 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -59,7 +59,7 @@ void libblis_test_gemm_ukr_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -152,7 +152,7 @@ void libblis_test_gemm_ukr_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -166,6 +166,8 @@ void libblis_test_gemm_ukr_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n, k; inc_t ldap, ldbp; @@ -178,9 +180,13 @@ void libblis_test_gemm_ukr_experiment cntx_t* cntx; + // Query a context. cntx = bli_gks_query_cntx(); + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. 
k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index 6fead8c82..351991bb9 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -59,7 +59,7 @@ void libblis_test_gemmtrsm_ukr_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -171,7 +171,7 @@ void libblis_test_gemmtrsm_ukr_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -185,6 +185,8 @@ void libblis_test_gemmtrsm_ukr_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n, k; inc_t ldap, ldbp; @@ -203,9 +205,13 @@ void libblis_test_gemmtrsm_ukr_experiment cntx_t* cntx; + // Query a context. cntx = bli_gks_query_cntx(); + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. 
k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_gemv.c b/testsuite/src/test_gemv.c index 4303cd11e..75a93395c 100644 --- a/testsuite/src/test_gemv.c +++ b/testsuite/src/test_gemv.c @@ -59,7 +59,7 @@ void libblis_test_gemv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -148,7 +148,7 @@ void libblis_test_gemv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -162,6 +162,8 @@ void libblis_test_gemv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n; trans_t transa; @@ -172,6 +174,9 @@ void libblis_test_gemv_experiment obj_t y_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_ger.c b/testsuite/src/test_ger.c index 89e215448..590374089 100644 --- a/testsuite/src/test_ger.c +++ b/testsuite/src/test_ger.c @@ -59,7 +59,7 @@ void libblis_test_ger_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -146,7 +146,7 @@ void libblis_test_ger_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -160,6 +160,8 @@ void libblis_test_ger_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n; conj_t conjx, conjy; @@ -168,6 +170,9 @@ void libblis_test_ger_experiment obj_t a_save; + // Use the datatype of the first char in the datatype combination string. 
+ bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_hemm.c b/testsuite/src/test_hemm.c index 00942bea9..b370dfd58 100644 --- a/testsuite/src/test_hemm.c +++ b/testsuite/src/test_hemm.c @@ -59,7 +59,7 @@ void libblis_test_hemm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -154,7 +154,7 @@ void libblis_test_hemm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -168,6 +168,8 @@ void libblis_test_hemm_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n; dim_t mn_side; @@ -180,6 +182,9 @@ void libblis_test_hemm_experiment obj_t c_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_hemv.c b/testsuite/src/test_hemv.c index be985bd32..f940813f7 100644 --- a/testsuite/src/test_hemv.c +++ b/testsuite/src/test_hemv.c @@ -59,7 +59,7 @@ void libblis_test_hemv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -149,7 +149,7 @@ void libblis_test_hemv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -163,6 +163,8 @@ void libblis_test_hemv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; uplo_t uploa; @@ -173,6 +175,9 @@ void libblis_test_hemv_experiment obj_t y_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_her.c b/testsuite/src/test_her.c index 49dadb1c1..534106a7b 100644 --- a/testsuite/src/test_her.c +++ b/testsuite/src/test_her.c @@ -59,7 +59,7 @@ void libblis_test_her_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -146,7 +146,7 @@ void libblis_test_her_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -160,6 +160,8 @@ void libblis_test_her_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; uplo_t uploa; @@ -169,6 +171,9 @@ void libblis_test_her_experiment obj_t a_save; + // Use the datatype of the first char in the datatype combination string. 
+ bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_her2.c b/testsuite/src/test_her2.c index 040df7e68..e3d731ced 100644 --- a/testsuite/src/test_her2.c +++ b/testsuite/src/test_her2.c @@ -59,7 +59,7 @@ void libblis_test_her2_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -148,7 +148,7 @@ void libblis_test_her2_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -162,6 +162,8 @@ void libblis_test_her2_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; uplo_t uploa; @@ -171,6 +173,9 @@ void libblis_test_her2_experiment obj_t a_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_her2k.c b/testsuite/src/test_her2k.c index bfc1fa900..b61bdf813 100644 --- a/testsuite/src/test_her2k.c +++ b/testsuite/src/test_her2k.c @@ -59,7 +59,7 @@ void libblis_test_her2k_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -152,7 +152,7 @@ void libblis_test_her2k_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -166,6 +166,8 @@ void libblis_test_her2k_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, k; uplo_t uploc; @@ -175,6 +177,9 @@ void libblis_test_her2k_experiment obj_t c_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); k = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_herk.c b/testsuite/src/test_herk.c index 3a68eee7a..f5d2c91f5 100644 --- a/testsuite/src/test_herk.c +++ b/testsuite/src/test_herk.c @@ -59,7 +59,7 @@ void libblis_test_herk_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -150,7 +150,7 @@ void libblis_test_herk_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -164,6 +164,8 @@ void libblis_test_herk_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, k; uplo_t uploc; @@ -173,6 +175,9 @@ void libblis_test_herk_experiment obj_t c_save; + // Use the datatype of the first char in the datatype combination string. 
+ bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); k = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index d7f5825be..230b65820 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -50,6 +50,13 @@ char libblis_test_store_chars[ NUM_OPERAND_TYPES ][ MAX_STORE_VALS_PER_TYPE + 1 char libblis_test_param_chars[ NUM_PARAM_TYPES ][ MAX_PARAM_VALS_PER_TYPE + 1 ]; +char libblis_test_sp_chars[ 2 + 1 ] = "sc"; +char libblis_test_dp_chars[ 2 + 1 ] = "dz"; + +char libblis_test_rd_chars[ 2 + 1 ] = "sd"; +char libblis_test_cd_chars[ 2 + 1 ] = "cz"; + +char libblis_test_dt_chars[ 4 + 1 ] = "sdcz"; int main( int argc, char** argv ) @@ -239,6 +246,7 @@ void libblis_test_level1m_ops( thread_data_t* tdata, test_params_t* params, test libblis_test_scal2m( tdata, params, &(ops->scal2m) ); libblis_test_setm( tdata, params, &(ops->setm) ); libblis_test_subm( tdata, params, &(ops->subm) ); + libblis_test_xpbym( tdata, params, &(ops->xpbym) ); } @@ -348,6 +356,7 @@ void libblis_test_read_ops_file( char* input_filename, test_ops_t* ops ) libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN, 1, &(ops->scal2m) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN, 0, &(ops->setm) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN, 1, &(ops->subm) ); + libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN, 1, &(ops->xpbym) ); // Level-1f libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 2, &(ops->axpy2v) ); @@ -482,14 +491,25 @@ void libblis_test_read_params_file( char* input_filename, test_params_t* params for( i = 0; i < params->n_datatypes; ++i ) { - if ( temp[i] == 's' ) params->datatype[i] = BLIS_FLOAT; - else if ( temp[i] == 'd' ) 
params->datatype[i] = BLIS_DOUBLE; - else if ( temp[i] == 'c' ) params->datatype[i] = BLIS_SCOMPLEX; - else if ( temp[i] == 'z' ) params->datatype[i] = BLIS_DCOMPLEX; + //if ( temp[i] == 's' ) params->datatype[i] = BLIS_FLOAT; + //else if ( temp[i] == 'd' ) params->datatype[i] = BLIS_DOUBLE; + //else if ( temp[i] == 'c' ) params->datatype[i] = BLIS_SCOMPLEX; + //else if ( temp[i] == 'z' ) params->datatype[i] = BLIS_DCOMPLEX; + + // Map the char in temp[i] to the corresponding num_t value. + bli_param_map_char_to_blis_dt( temp[i], &(params->datatype[i]) ); params->datatype_char[i] = temp[i]; } + // Read whether to test gemm with mixed-domain operands. + libblis_test_read_next_line( buffer, input_stream ); + sscanf( buffer, "%u ", &(params->mixed_domain) ); + + // Read whether to test gemm with mixed-precision operands. + libblis_test_read_next_line( buffer, input_stream ); + sscanf( buffer, "%u ", &(params->mixed_precision) ); + // Read the initial problem size to test. libblis_test_read_next_line( buffer, input_stream ); sscanf( buffer, "%u ", &(params->p_first) ); @@ -1073,6 +1093,8 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) for( i = 1; i < params->n_datatypes; ++i ) libblis_test_fprintf_c( os, " [%d] %d (%c)\n", i, params->datatype[i], params->datatype_char[i] ); + libblis_test_fprintf_c( os, "mix domains for gemm? %u\n", params->mixed_domain ); + libblis_test_fprintf_c( os, "mix precisions for gemm? %u\n", params->mixed_precision ); libblis_test_fprintf_c( os, "problem size: first to test %u\n", params->p_first ); libblis_test_fprintf_c( os, "problem size: max to test %u\n", params->p_max ); libblis_test_fprintf_c( os, "problem size increment %u\n", params->p_inc ); @@ -1091,6 +1113,29 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "output to stdout AND files? 
%u\n", params->output_files ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf( os, "\n" ); + +#ifndef BLIS_ENABLE_GEMM_MD + // Notify the user if mixed domain or mixed precision was requested. + if ( params->mixed_domain || params->mixed_precision ) + { + libblis_test_printf_error( "mixed domain and/or mixed precision testing requested, but building against BLIS without mixed datatype support.\n" ); + } +#endif + + // If mixed domain or mixed precision was requested, we disable all + // induced methods. + if ( params->mixed_domain || params->mixed_precision ) + { + ind_t im; + + for ( im = BLIS_IND_FIRST; im < BLIS_IND_LAST+1; ++im ) + { + params->ind_enable[ im ] = 0; + } + + // Reenable native execution. + params->ind_enable[ BLIS_NAT ] = 1; + } } @@ -1354,23 +1399,26 @@ void carryover( unsigned int* c, -void libblis_test_op_driver( thread_data_t* tdata, - test_params_t* params, - test_op_t* op, - iface_t iface, - char* op_str, - char* p_types, - char* o_types, - thresh_t* thresh, - void (*f_exp) (test_params_t*, // params struct - test_op_t*, // op struct - iface_t, // iface - num_t, // datatype (current datatype) - char*, // pc_str (current param string) - char*, // sc_str (current storage string) - unsigned int, // p_cur (current problem size) - double*, // perf - double* ) ) // residual +void libblis_test_op_driver + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op, + iface_t iface, + char* op_str, + char* p_types, + char* o_types, + thresh_t* thresh, + void (*f_exp) (test_params_t*, // params struct + test_op_t*, // op struct + iface_t, // iface + char*, // datatype (current datatype) + char*, // pc_str (current param string) + char*, // sc_str (current storage string) + unsigned int, // p_cur (current problem size) + double*, // perf + double* ) // residual + ) { unsigned int n_mstorage = params->n_mstorage; unsigned int n_vstorage = params->n_vstorage; @@ -1379,6 +1427,8 @@ void libblis_test_op_driver( thread_data_t* tdata, 
unsigned int p_max = params->p_max; unsigned int p_inc = params->p_inc; unsigned int mix_all_storage = params->mix_all_storage; + unsigned int mixed_domain = params->mixed_domain; + unsigned int mixed_precision = params->mixed_precision; unsigned int reaction_to_failure = params->reaction_to_failure; num_t datatype; @@ -1392,12 +1442,28 @@ void libblis_test_op_driver( thread_data_t* tdata, char s_spec_str[ MAX_NUM_OPERANDS + 1 ]; unsigned int n_operands; + unsigned int n_operandsp1; char** chars_for_storage; unsigned int n_store_combos; char** sc_str; + char d_spec_str[ MAX_NUM_OPERANDS + 1 ]; + char** chars_for_spdt; + char** chars_for_dpdt; + unsigned int n_spdt_combos; + unsigned int n_dpdt_combos; + unsigned int n_dt_combos; + char** dc_str; + + char** chars_for_dt; + char** chars_for_rddt; + char** chars_for_cddt; + unsigned int n_rddt_combos; + unsigned int n_cddt_combos; + unsigned int p_cur, pi; - unsigned int dt, indi, pci, sci, i, j, o; + unsigned int indi, pci, sci, dci, i, j, o; + unsigned int is_mixed_dt; double perf, resid; char* pass_str; @@ -1411,6 +1477,13 @@ void libblis_test_op_driver( thread_data_t* tdata, FILE* output_stream = NULL; + // These arrays are malloc()'ed in select branches. Here, we set + // them to NULL so they can be unconditionally free()'ed at the + // end of the function. + chars_for_rddt = NULL; + chars_for_cddt = NULL; + chars_for_spdt = NULL; + chars_for_dpdt = NULL; // If output to files was requested, attempt to open a file stream. if ( params->output_files ) @@ -1447,11 +1520,8 @@ void libblis_test_op_driver( thread_data_t* tdata, // Compute the total number of parameter combinations to test (which is // simply the product of the string lengths of chars_for_param[i]. - for ( i = 0, n_param_combos = 1; i < n_params; ++i ) - { - if ( p_spec_str[i] == '?' 
) - n_param_combos *= strlen( chars_for_param[i] ); - } + n_param_combos = libblis_test_count_combos( n_params, p_spec_str, + chars_for_param ); // Allocate an array of parameter combination strings, one for each // parameter combination that needs to be tested. @@ -1477,12 +1547,11 @@ void libblis_test_op_driver( thread_data_t* tdata, if ( iface == BLIS_TEST_SEQ_UKERNEL ) mix_all_storage = DISABLE; - // Determine the total number of storage schemes. + // Enumerate all combinations of storage schemes requested. if ( mix_all_storage ) { // Fill storage specification string with wildcard chars. - for ( i = 0; i < n_operands; ++i ) - s_spec_str[i] = '?'; + for ( i = 0; i < n_operands; ++i ) s_spec_str[i] = '?'; s_spec_str[i] = '\0'; // Allocate an array that stores pointers to the sets of possible @@ -1499,11 +1568,8 @@ void libblis_test_op_driver( thread_data_t* tdata, // Compute the total number of storage combinations to test (which is // simply the product of the string lengths of chars_for_storage[i]. - for ( i = 0, n_store_combos = 1; i < n_operands; ++i ) - { - if ( s_spec_str[i] == '?' ) - n_store_combos *= strlen( chars_for_storage[i] ); - } + n_store_combos = libblis_test_count_combos( n_operands, s_spec_str, + chars_for_storage ); // Allocate an array of storage combination strings, one for each // storage combination that needs to be tested. @@ -1581,6 +1647,300 @@ void libblis_test_op_driver( thread_data_t* tdata, } } + // Enumerate all combinations of datatype domains requested, but only + // for the gemm operation. + + if ( !mixed_domain && mixed_precision && op->opid == BLIS_GEMM ) + { + is_mixed_dt = TRUE; + + // Increment the number of operands by one to account for the + // computation precision (or computation datatype, as we will encode + // it in the char string). 
+ n_operandsp1 = n_operands + 1; + + unsigned int has_rd = libblis_test_dt_str_has_rd_char( params ); + unsigned int has_cd = libblis_test_dt_str_has_cd_char( params ); + + // Fill datatype specification string with wildcard chars. + for ( i = 0; i < n_operandsp1; ++i ) d_spec_str[i] = '?'; + d_spec_str[i] = '\0'; + + // Allocate an array that stores pointers to the sets of possible + // datatype chars for each operand. + chars_for_rddt = ( char** ) malloc( n_operandsp1 * sizeof( char* ) ); + chars_for_cddt = ( char** ) malloc( n_operandsp1 * sizeof( char* ) ); + + // Set the values in chars_for_rddt/cddt to the address of the string + // that holds the datatype chars. + for ( i = 0; i < n_operandsp1; ++i ) + { + chars_for_rddt[i] = libblis_test_rd_chars; + chars_for_cddt[i] = libblis_test_cd_chars; + } + + // Set the last set of chars in chars_for_cddt to the real domain + // charset. This is because the last char will be the computation + // precision. + //chars_for_cddt[i-1] = libblis_test_rd_chars; + + // Compute the total number of datatype combinations to test (which is + // simply the product of the string lengths of chars_for_rddt/cddt[i]). + // NOTE: We skip inspecting/branching off of the d_spec_str chars since + // we know they are all '?'. + n_rddt_combos = 0; n_cddt_combos = 0; + + if ( has_rd ) + n_rddt_combos = libblis_test_count_combos( n_operandsp1, d_spec_str, + chars_for_rddt ); + + if ( has_cd ) + n_cddt_combos = libblis_test_count_combos( n_operandsp1, d_spec_str, + chars_for_cddt ); + + // Add real and complex domain combinations. + n_dt_combos = n_rddt_combos + n_cddt_combos; + + // Allocate an array of datatype combination strings, one for each + // datatype combination that needs to be tested. 
+ dc_str = ( char** ) malloc( n_dt_combos * sizeof( char* ) ); + for ( dci = 0; dci < n_dt_combos; ++dci ) + dc_str[dci] = ( char* ) malloc( ( n_operandsp1 + 1 ) * sizeof( char ) ); + + char** dc_str_p = dc_str; + + // Fill the datatype combination strings in dc_str with the datatype + // combinations implied by chars_for_rddt/cddt. + if ( has_rd ) + { + libblis_test_fill_param_strings( d_spec_str, + chars_for_rddt, + n_operandsp1, + n_rddt_combos, + dc_str_p ); + dc_str_p += n_rddt_combos; + } + if ( has_cd ) + { + libblis_test_fill_param_strings( d_spec_str, + chars_for_cddt, + n_operandsp1, + n_cddt_combos, + dc_str_p ); + dc_str_p += n_cddt_combos; + } + +#if 0 + printf( "n_rddt_combos = %d\n", n_rddt_combos ); + printf( "n_cddt_combos = %d\n", n_cddt_combos ); + printf( "n_dt_combos = %d\n\n", n_dt_combos ); + + for ( dci = 0; dci < n_dt_combos; ++dci ) + printf( "dc_str[%2d] = %s\n", dci, dc_str[dci] ); + + bli_abort(); +#endif + } + else if ( mixed_domain && !mixed_precision && op->opid == BLIS_GEMM ) + { + is_mixed_dt = TRUE; + + // Increment the number of operands by one to account for the + // computation precision (or computation datatype, as we will encode + // it in the char string). + n_operandsp1 = n_operands + 1; + + unsigned int has_sp = libblis_test_dt_str_has_sp_char( params ); + unsigned int has_dp = libblis_test_dt_str_has_dp_char( params ); + + // Fill datatype specification string with wildcard chars. + for ( i = 0; i < n_operands; ++i ) d_spec_str[i] = '?'; + d_spec_str[i] = '\0'; + + // Allocate an array that stores pointers to the sets of possible + // datatype chars for each operand (plus the computation precision + // char). + chars_for_spdt = ( char** ) malloc( n_operands * sizeof( char* ) ); + chars_for_dpdt = ( char** ) malloc( n_operands * sizeof( char* ) ); + + // Set the values in chars_for_spdt/dpdt to the address of the string + // that holds the datatype chars. 
+ for ( i = 0; i < n_operands; ++i ) + { + chars_for_spdt[i] = libblis_test_sp_chars; + chars_for_dpdt[i] = libblis_test_dp_chars; + } + + // Compute the total number of datatype combinations to test (which is + // simply the product of the string lengths of chars_for_spdt/dpdt[i]). + // NOTE: We skip inspecting/branching off of the d_spec_str chars since + // we know they are all '?'. + n_spdt_combos = 0; n_dpdt_combos = 0; + + if ( has_sp ) + n_spdt_combos = libblis_test_count_combos( n_operands, d_spec_str, + chars_for_spdt ); + + if ( has_dp ) + n_dpdt_combos = libblis_test_count_combos( n_operands, d_spec_str, + chars_for_dpdt ); + + // Add single- and double-precision combinations. + n_dt_combos = n_spdt_combos + n_dpdt_combos; + + // Allocate an array of datatype combination strings, one for each + // datatype combination that needs to be tested. + dc_str = ( char** ) malloc( n_dt_combos * sizeof( char* ) ); + for ( dci = 0; dci < n_dt_combos; ++dci ) + dc_str[dci] = ( char* ) malloc( ( n_operandsp1 + 1 ) * sizeof( char ) ); + + char** dc_str_p = dc_str; + + // Fill the datatype combination strings in dc_str with the datatype + // combinations implied by chars_for_spdt/dpdt. + if ( has_sp ) + { + libblis_test_fill_param_strings( d_spec_str, + chars_for_spdt, + n_operands, + n_spdt_combos, + dc_str_p ); + dc_str_p += n_spdt_combos; + } + if ( has_dp ) + { + libblis_test_fill_param_strings( d_spec_str, + chars_for_dpdt, + n_operands, + n_dpdt_combos, + dc_str_p ); + dc_str_p += n_dpdt_combos; + } + + // Manually set the computation char to the real projection of the + // first char of each combination. 
+ for ( i = 0; i < n_dt_combos; ++i ) + { + dc_str[i][3] = libblis_test_proj_dtchar_to_precchar( dc_str[i][0] ); + dc_str[i][4] = '\0'; + } + +#if 0 + printf( "n_spdt_combos = %d\n", n_spdt_combos ); + printf( "n_dpdt_combos = %d\n", n_dpdt_combos ); + printf( "n_dt_combos = %d\n\n", n_dt_combos ); + + for ( dci = 0; dci < n_dt_combos; ++dci ) + printf( "dc_str[%2d] = %s\n", dci, dc_str[dci] ); + + bli_abort(); +#endif + } + else if ( mixed_domain && mixed_precision && op->opid == BLIS_GEMM ) + { + is_mixed_dt = TRUE; + + // Increment the number of operands by one to account for the + // computation precision (or computation datatype, as we will encode + // it in the char string). + n_operandsp1 = n_operands + 1; + + // Fill datatype specification string with wildcard chars. + for ( i = 0; i < n_operandsp1; ++i ) d_spec_str[i] = '?'; + d_spec_str[i] = '\0'; + + // Allocate an array that stores pointers to the sets of possible + // datatype chars for each operand. + chars_for_dt = ( char** ) malloc( n_operandsp1 * sizeof( char* ) ); + + // Set the values in chars_for_dt to the address of the string + // that holds the datatype chars. + for ( i = 0; i < n_operandsp1; ++i ) + { + chars_for_dt[i] = libblis_test_dt_chars; + } + + // Set the last set of chars in chars_for_dt to the real domain + // charset. This is because the last char will be the computation + // precision, with the computation domain implied by the operands' + // storage datatypes. + chars_for_dt[i-1] = libblis_test_rd_chars; + + // Compute the total number of datatype combinations to test (which is + // simply the product of the string lengths of chars_for_dt[i]). + // NOTE: We skip inspecting/branching off of the d_spec_str chars since + // we know they are all '?'. + n_dt_combos = libblis_test_count_combos( n_operandsp1, d_spec_str, + chars_for_dt ); + + // Allocate an array of datatype combination strings, one for each + // datatype combination that needs to be tested. 
+ dc_str = ( char** ) malloc( n_dt_combos * sizeof( char* ) ); + for ( dci = 0; dci < n_dt_combos; ++dci ) + dc_str[dci] = ( char* ) malloc( ( n_operandsp1 + 1 ) * sizeof( char ) ); + + // Fill the datatype combination strings in dc_str with the datatype + // combinations implied by chars_for_dt. + libblis_test_fill_param_strings( d_spec_str, + chars_for_dt, + n_operandsp1, + n_dt_combos, + dc_str ); + +#if 0 + printf( "n_dt_combos = %d\n\n", n_dt_combos ); + + for ( dci = 0; dci < n_dt_combos; ++dci ) + printf( "dc_str[%3d] = %s\n", dci, dc_str[dci] ); + + bli_abort(); +#endif + } + else // ( ( !mixed_domain && !mixed_precision ) || op->opid != BLIS_GEMM ) + { + is_mixed_dt = FALSE; + + // Increment the number of operands by one to account for the + // computation precision (or computation datatype, as we will encode + // it in the char string). + n_operandsp1 = n_operands + 1; + + // Since we are not mixing domains, we only consider n_datatypes + // datatype combinations, where each combination is actually + // homogeneous (e.g. "sss", "ddd", etc., if n_operands == 3). + n_dt_combos = n_datatypes; + + // Allocate an array of datatype combination strings, one for each + // datatype specified. + dc_str = ( char** ) malloc( n_dt_combos * sizeof( char* ) ); + for ( dci = 0; dci < n_dt_combos; ++dci ) + dc_str[dci] = ( char* ) malloc( ( n_operandsp1 + 1 ) * sizeof( char ) ); + + // Fill each datatype combination string with the same dt char for + // each operand in the current operation. + for ( dci = 0; dci < n_dt_combos; ++dci ) + { + dt_char = params->datatype_char[dci]; + + for ( i = 0; i < n_operands; ++i ) + dc_str[dci][i] = dt_char; + + // Encode the computation precision as the last char. 
+ dc_str[dci][i] = libblis_test_proj_dtchar_to_precchar( dc_str[dci][0] ); + + dc_str[dci][i+1] = '\0'; + } + +#if 0 + printf( "n_dt_combos = %d\n\n", n_dt_combos ); + + for ( dci = 0; dci < n_dt_combos; ++dci ) + printf( "dc_str[%3d] = %s\n", dci, dc_str[dci] ); + + bli_abort(); +#endif + } + // These statements should only be executed by one thread. @@ -1611,10 +1971,27 @@ void libblis_test_op_driver( thread_data_t* tdata, for ( sci = 0; sci < n_store_combos; ++sci ) { // Loop over the requested datatypes. - for ( dt = 0; dt < n_datatypes; ++dt ) + for ( dci = 0; dci < n_dt_combos; ++dci ) + //for ( dci = 14; dci < 15; ++dci ) + //for ( dci = 12; dci < 13; ++dci ) + //for ( dci = 4; dci < 5; ++dci ) + //for ( dci = 8; dci < 9; ++dci ) + //for ( dci = 0; dci < 1; ++dci ) { - datatype = params->datatype[dt]; - dt_char = params->datatype_char[dt]; + // We need a datatype to use for induced method related things + // as well as to decide which set of residual thresholds to use. + // We must choose the first operand's dt char since that's the + // only operand we know is guaranteed to exist. + bli_param_map_char_to_blis_dt( dc_str[dci][0], &datatype ); + + // If any of the operands are single precision, ensure that + // datatype is also single precision. + int has_sp = libblis_test_dt_str_has_sp_char_str( n_operandsp1, + dc_str[dci] ); + if ( has_sp ) + { + datatype = bli_dt_proj_to_single_prec( datatype ); + } // Build a commented column label string. libblis_test_build_col_labels_string( params, op, label_str ); @@ -1680,7 +2057,7 @@ void libblis_test_op_driver( thread_data_t* tdata, f_exp( params, op, iface, - datatype, + dc_str[dci], pc_str[pci], sc_str[sci], p_cur, @@ -1692,18 +2069,22 @@ void libblis_test_op_driver( thread_data_t* tdata, // Query the string corresponding to the residual's // position relative to the thresholds. 
+ // NOTE: Passing in datatype (ie: the value associated + // with dc_str[dci][0]) will work, but just barely, since + // the numerical thresholds within precisions should be + // the same. pass_str = libblis_test_get_string_for_result( resid, datatype, thresh ); - // Build a string unique to the operation, datatype, - // parameter combination, and storage combination being - // tested. + // Build a string unique to the operation, datatype combo, + // parameter combo, and storage combo being tested. libblis_test_build_function_string( BLIS_FILEDATA_PREFIX_STR, indi, ind_str, op_str, - dt_char, + is_mixed_dt, + dc_str[dci], n_param_combos, pc_str[pci], sc_str[sci], @@ -1812,6 +2193,18 @@ void libblis_test_op_driver( thread_data_t* tdata, free( sc_str[sci] ); free( sc_str ); + // Free some auxiliary arrays used by the mixed-domain/mixed-precision + // datatype-handling logic. + free( chars_for_rddt ); + free( chars_for_cddt ); + free( chars_for_spdt ); + free( chars_for_dpdt ); + + // Free the datatype combination strings and then the master pointer. + for ( dci = 0; dci < n_dt_combos; ++dci ) + free( dc_str[dci] ); + free( dc_str ); + // If the file was opened (successfully), close the output stream. if ( output_stream ) @@ -1824,17 +2217,27 @@ void libblis_test_op_driver( thread_data_t* tdata, -void libblis_test_build_function_string( char* prefix_str, - ind_t method, - char* ind_str, - char* op_str, - char dt_char, - unsigned int n_param_combos, - char* pc_str, - char* sc_str, - char* funcname_str ) +void libblis_test_build_function_string + ( + char* prefix_str, + ind_t method, + char* ind_str, + char* op_str, + unsigned int is_mixed_dt, + char* dc_str, + unsigned int n_param_combos, + char* pc_str, + char* sc_str, + char* funcname_str + ) { - sprintf( funcname_str, "%s_%c%s", prefix_str, dt_char, op_str ); + // We only print the full datatype combination string if is_mixed_dt + // is set and native execution is being used. 
Otherwise, we print only + // the first char (since they are all the same). + if ( is_mixed_dt == TRUE && method == BLIS_NAT ) + sprintf( funcname_str, "%s_%s%s", prefix_str, dc_str, op_str ); + else + sprintf( funcname_str, "%s_%c%s", prefix_str, dc_str[0], op_str ); // If the method is non-native (ie: induced), append a string // identifying the induced method. @@ -2662,3 +3065,99 @@ int libblis_test_l3_is_disabled( test_op_t* op ) if ( op->ops->l3_over == DISABLE ) return TRUE; else return FALSE; } + +// --- + +int libblis_test_dt_str_has_sp_char( test_params_t* params ) +{ + return libblis_test_dt_str_has_sp_char_str( params->n_datatypes, + params->datatype_char ); +} + +int libblis_test_dt_str_has_sp_char_str( int n, char* str ) +{ + for ( int i = 0; i < n; ++i ) + { + if ( str[i] == 's' || + str[i] == 'c' ) return TRUE; + } + + return FALSE; +} + +// --- + +int libblis_test_dt_str_has_dp_char( test_params_t* params ) +{ + return libblis_test_dt_str_has_dp_char_str( params->n_datatypes, + params->datatype_char ); +} + +int libblis_test_dt_str_has_dp_char_str( int n, char* str ) +{ + for ( int i = 0; i < n; ++i ) + { + if ( str[i] == 'd' || + str[i] == 'z' ) return TRUE; + } + + return FALSE; +} + +// --- + +int libblis_test_dt_str_has_rd_char( test_params_t* params ) +{ + int i; + + for ( i = 0; i < params->n_datatypes; ++i ) + { + if ( params->datatype_char[i] == 's' || + params->datatype_char[i] == 'd' ) return TRUE; + } + + return FALSE; +} + +int libblis_test_dt_str_has_cd_char( test_params_t* params ) +{ + int i; + + for ( i = 0; i < params->n_datatypes; ++i ) + { + if ( params->datatype_char[i] == 'c' || + params->datatype_char[i] == 'z' ) return TRUE; + } + + return FALSE; +} + +unsigned int libblis_test_count_combos + ( + unsigned int n_operands, + char* spec_str, + char** char_sets + ) +{ + unsigned int n_combos = 1; + int i; + + for ( i = 0; i < n_operands; ++i ) + { + if ( spec_str[i] == '?' 
) + n_combos *= strlen( char_sets[i] ); + } + + return n_combos; +} + +char libblis_test_proj_dtchar_to_precchar( char dt_char ) +{ + char r_val = dt_char; + + if ( r_val == 'c' ) r_val = 's'; + else if ( r_val == 'z' ) r_val = 'd'; + + return r_val; +} + diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index 5b2f2c2e5..020f23549 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -176,6 +176,8 @@ typedef struct unsigned int n_datatypes; char datatype_char[ MAX_NUM_DATATYPES + 1 ]; num_t datatype[ MAX_NUM_DATATYPES + 1 ]; + unsigned int mixed_domain; + unsigned int mixed_precision; unsigned int p_first; unsigned int p_max; unsigned int p_inc; @@ -251,6 +253,7 @@ typedef struct test_ops_s test_op_t scal2m; test_op_t setm; test_op_t subm; + test_op_t xpbym; // level-1f test_op_t axpy2v; @@ -369,35 +372,42 @@ void carryover( unsigned int* c, // --- Operation driver --- -void libblis_test_op_driver( thread_data_t* tdata, - test_params_t* params, - test_op_t* op, - iface_t iface, - char* op_str, - char* p_types, - char* o_types, - thresh_t* thresh, - void (*f_exp) (test_params_t*, // params struct - test_op_t*, // op struct - iface_t, // iface - num_t, // datatype (current datatype) - char*, // pc_str (current param string) - char*, // sc_str (current storage string) - unsigned int, // p_cur (current problem size) - double*, // perf - double* ) ); // residual +void libblis_test_op_driver + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op, + iface_t iface, + char* op_str, + char* p_types, + char* o_types, + thresh_t* thresh, + void (*f_exp) (test_params_t*, // params struct + test_op_t*, // op struct + iface_t, // iface + char*, // dc_str (current datatype string) + char*, // pc_str (current param string) + char*, // sc_str (current storage string) + unsigned int, // p_cur (current problem size) + double*, // perf + double*) // residual + ); // --- Generate experiment string labels --- -void 
libblis_test_build_function_string( char* prefix_str, - ind_t method, - char* ind_str, - char* op_str, - char dt_char, - unsigned int n_param_combos, - char* pc_str, - char* sc_str, - char* func_str ); +void libblis_test_build_function_string + ( + char* prefix_str, + ind_t method, + char* ind_str, + char* op_str, + unsigned int is_mixed_dt, + char* dc_str, + unsigned int n_param_combos, + char* pc_str, + char* sc_str, + char* funcname_str + ); void libblis_test_build_dims_string( test_op_t* op, dim_t p_cur, @@ -465,6 +475,21 @@ int libblis_test_l1f_is_disabled( test_op_t* op ); int libblis_test_l2_is_disabled( test_op_t* op ); int libblis_test_l3ukr_is_disabled( test_op_t* op ); int libblis_test_l3_is_disabled( test_op_t* op ); +int libblis_test_dt_str_has_sp_char( test_params_t* params ); +int libblis_test_dt_str_has_sp_char_str( int n, char* str ); +int libblis_test_dt_str_has_dp_char( test_params_t* params ); +int libblis_test_dt_str_has_dp_char_str( int n, char* str ); +int libblis_test_dt_str_has_rd_char( test_params_t* params ); +int libblis_test_dt_str_has_cd_char( test_params_t* params ); + +unsigned int libblis_test_count_combos + ( + unsigned int n_operands, + char* spec_str, + char** char_sets + ); +char libblis_test_proj_dtchar_to_precchar( char dt_char ); + // // --- Test module headers ----------------------------------------------------- @@ -498,6 +523,7 @@ int libblis_test_l3_is_disabled( test_op_t* op ); #include "test_scal2m.h" #include "test_setm.h" #include "test_subm.h" +#include "test_xpbym.h" // Level-1f kernels #include "test_axpy2v.h" diff --git a/testsuite/src/test_normfm.c b/testsuite/src/test_normfm.c index 41ecccb7f..7ae052f21 100644 --- a/testsuite/src/test_normfm.c +++ b/testsuite/src/test_normfm.c @@ -59,7 +59,7 @@ void libblis_test_normfm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -137,7 +137,7 @@ void 
libblis_test_normfm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -148,17 +148,24 @@ void libblis_test_normfm_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - num_t dt_real = bli_dt_proj_to_real( datatype ); - double time_min = DBL_MAX; double time; + num_t datatype; + num_t dt_real; + dim_t m, n; obj_t beta, norm; obj_t x; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + + // Compute the real projection of the chosen datatype. + dt_real = bli_dt_proj_to_real( datatype ); + // Map the dimension specifier to actual dimensions. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_normfv.c b/testsuite/src/test_normfv.c index 791fa9fc5..ac83481ed 100644 --- a/testsuite/src/test_normfv.c +++ b/testsuite/src/test_normfv.c @@ -59,7 +59,7 @@ void libblis_test_normfv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -137,7 +137,7 @@ void libblis_test_normfv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -148,17 +148,24 @@ void libblis_test_normfv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - num_t dt_real = bli_dt_proj_to_real( datatype ); - double time_min = DBL_MAX; double time; + num_t datatype; + num_t dt_real; + dim_t m; obj_t beta, norm; obj_t x; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + + // Compute the real projection of the chosen datatype. 
+ dt_real = bli_dt_proj_to_real( datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_randm.c b/testsuite/src/test_randm.c index 830440c45..37b437d42 100644 --- a/testsuite/src/test_randm.c +++ b/testsuite/src/test_randm.c @@ -59,7 +59,7 @@ void libblis_test_randm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -134,7 +134,7 @@ void libblis_test_randm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t dt, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -148,6 +148,8 @@ void libblis_test_randm_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n; char x_store; @@ -155,6 +157,9 @@ void libblis_test_randm_experiment obj_t x; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); @@ -165,7 +170,7 @@ void libblis_test_randm_experiment x_store = sc_str[0]; // Create the test objects. - libblis_test_mobj_create( params, dt, BLIS_NO_TRANSPOSE, x_store, m, n, &x ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, x_store, m, n, &x ); // Repeat the experiment n_repeats times and record results. 
for ( i = 0; i < n_repeats; ++i ) diff --git a/testsuite/src/test_randv.c b/testsuite/src/test_randv.c index 0cccb0b5e..e1bf28fb9 100644 --- a/testsuite/src/test_randv.c +++ b/testsuite/src/test_randv.c @@ -59,7 +59,7 @@ void libblis_test_randv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -134,7 +134,7 @@ void libblis_test_randv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -148,6 +148,8 @@ void libblis_test_randv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; char x_store; @@ -155,6 +157,9 @@ void libblis_test_randv_experiment obj_t x; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_scal2m.c b/testsuite/src/test_scal2m.c index 13af3791c..9814326af 100644 --- a/testsuite/src/test_scal2m.c +++ b/testsuite/src/test_scal2m.c @@ -59,7 +59,7 @@ void libblis_test_scal2m_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -143,7 +143,7 @@ void libblis_test_scal2m_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -157,6 +157,8 @@ void libblis_test_scal2m_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n; trans_t transx; @@ -165,6 +167,9 @@ void libblis_test_scal2m_experiment obj_t y_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_scal2v.c b/testsuite/src/test_scal2v.c index d2909f937..765bd92ec 100644 --- a/testsuite/src/test_scal2v.c +++ b/testsuite/src/test_scal2v.c @@ -59,7 +59,7 @@ void libblis_test_scal2v_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -143,7 +143,7 @@ void libblis_test_scal2v_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -157,6 +157,8 @@ void libblis_test_scal2v_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; conj_t conjx; @@ -165,6 +167,9 @@ void libblis_test_scal2v_experiment obj_t y_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_scalm.c b/testsuite/src/test_scalm.c index 93130a6d8..adeefacc1 100644 --- a/testsuite/src/test_scalm.c +++ b/testsuite/src/test_scalm.c @@ -59,7 +59,7 @@ void libblis_test_scalm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -139,7 +139,7 @@ void libblis_test_scalm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -153,6 +153,8 @@ void libblis_test_scalm_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n; conj_t conjbeta; @@ -161,6 +163,9 @@ void libblis_test_scalm_experiment obj_t y_save; + // Use the datatype of the first char in the datatype combination string. 
+ bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_scalv.c b/testsuite/src/test_scalv.c index d3bd9450e..e276d2a6b 100644 --- a/testsuite/src/test_scalv.c +++ b/testsuite/src/test_scalv.c @@ -59,7 +59,7 @@ void libblis_test_scalv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -140,7 +140,7 @@ void libblis_test_scalv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -154,6 +154,8 @@ void libblis_test_scalv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; conj_t conjbeta; @@ -162,6 +164,9 @@ void libblis_test_scalv_experiment obj_t y_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_setm.c b/testsuite/src/test_setm.c index e2d6336dd..dbedcdea4 100644 --- a/testsuite/src/test_setm.c +++ b/testsuite/src/test_setm.c @@ -59,7 +59,7 @@ void libblis_test_setm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -136,7 +136,7 @@ void libblis_test_setm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -150,12 +150,17 @@ void libblis_test_setm_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n; obj_t beta; obj_t x; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_setv.c b/testsuite/src/test_setv.c index d4f8143f4..f1984f6be 100644 --- a/testsuite/src/test_setv.c +++ b/testsuite/src/test_setv.c @@ -59,7 +59,7 @@ void libblis_test_setv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -136,7 +136,7 @@ void libblis_test_setv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -150,12 +150,17 @@ void libblis_test_setv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; obj_t beta; obj_t x; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_subm.c b/testsuite/src/test_subm.c index e6d84a5b3..287f299b1 100644 --- a/testsuite/src/test_subm.c +++ b/testsuite/src/test_subm.c @@ -59,7 +59,7 @@ void libblis_test_subm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -139,7 +139,7 @@ void libblis_test_subm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -150,6 +150,8 @@ void libblis_test_subm_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n; trans_t transx; @@ -158,6 +160,9 @@ void libblis_test_subm_experiment obj_t x, y; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_subv.c b/testsuite/src/test_subv.c index 636a16e9c..300b054a1 100644 --- a/testsuite/src/test_subv.c +++ b/testsuite/src/test_subv.c @@ -59,7 +59,7 @@ void libblis_test_subv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -139,7 +139,7 @@ void libblis_test_subv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -150,6 +150,8 @@ void libblis_test_subv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; conj_t conjx; @@ -158,6 +160,9 @@ void libblis_test_subv_experiment obj_t x, y; + // Use the datatype of the first char in the datatype combination string. 
+ bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_symm.c b/testsuite/src/test_symm.c index 65bda9634..3f2615b2f 100644 --- a/testsuite/src/test_symm.c +++ b/testsuite/src/test_symm.c @@ -59,7 +59,7 @@ void libblis_test_symm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -154,7 +154,7 @@ void libblis_test_symm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -168,6 +168,8 @@ void libblis_test_symm_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n; dim_t mn_side; @@ -180,6 +182,9 @@ void libblis_test_symm_experiment obj_t c_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_symv.c b/testsuite/src/test_symv.c index 82e932d81..7e06388d7 100644 --- a/testsuite/src/test_symv.c +++ b/testsuite/src/test_symv.c @@ -59,7 +59,7 @@ void libblis_test_symv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -149,7 +149,7 @@ void libblis_test_symv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -163,6 +163,8 @@ void libblis_test_symv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; uplo_t uploa; @@ -173,6 +175,9 @@ void libblis_test_symv_experiment obj_t y_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_syr.c b/testsuite/src/test_syr.c index c4a8f45a8..12d1e60f6 100644 --- a/testsuite/src/test_syr.c +++ b/testsuite/src/test_syr.c @@ -59,7 +59,7 @@ void libblis_test_syr_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -146,7 +146,7 @@ void libblis_test_syr_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -160,6 +160,8 @@ void libblis_test_syr_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; uplo_t uploa; @@ -169,6 +171,9 @@ void libblis_test_syr_experiment obj_t a_save; + // Use the datatype of the first char in the datatype combination string. 
+ bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_syr2.c b/testsuite/src/test_syr2.c index 9b2d59098..e28a4fdd0 100644 --- a/testsuite/src/test_syr2.c +++ b/testsuite/src/test_syr2.c @@ -59,7 +59,7 @@ void libblis_test_syr2_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -148,7 +148,7 @@ void libblis_test_syr2_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -162,6 +162,8 @@ void libblis_test_syr2_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; uplo_t uploa; @@ -171,6 +173,9 @@ void libblis_test_syr2_experiment obj_t a_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_syr2k.c b/testsuite/src/test_syr2k.c index 39405fda5..2aa9754ba 100644 --- a/testsuite/src/test_syr2k.c +++ b/testsuite/src/test_syr2k.c @@ -59,7 +59,7 @@ void libblis_test_syr2k_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -152,7 +152,7 @@ void libblis_test_syr2k_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -166,6 +166,8 @@ void libblis_test_syr2k_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, k; uplo_t uploc; @@ -175,6 +177,9 @@ void libblis_test_syr2k_experiment obj_t c_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); k = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_syrk.c b/testsuite/src/test_syrk.c index 621bd7c81..4e450ad03 100644 --- a/testsuite/src/test_syrk.c +++ b/testsuite/src/test_syrk.c @@ -59,7 +59,7 @@ void libblis_test_syrk_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -150,7 +150,7 @@ void libblis_test_syrk_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -164,6 +164,8 @@ void libblis_test_syrk_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, k; uplo_t uploc; @@ -173,6 +175,9 @@ void libblis_test_syrk_experiment obj_t c_save; + // Use the datatype of the first char in the datatype combination string. 
+ bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); k = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_trmm.c b/testsuite/src/test_trmm.c index a3c245aef..5ee739645 100644 --- a/testsuite/src/test_trmm.c +++ b/testsuite/src/test_trmm.c @@ -59,7 +59,7 @@ void libblis_test_trmm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -150,7 +150,7 @@ void libblis_test_trmm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -164,6 +164,8 @@ void libblis_test_trmm_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n; dim_t mn_side; @@ -176,6 +178,9 @@ void libblis_test_trmm_experiment obj_t b_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_trmm3.c b/testsuite/src/test_trmm3.c index caf8269b5..494c7ef84 100644 --- a/testsuite/src/test_trmm3.c +++ b/testsuite/src/test_trmm3.c @@ -59,7 +59,7 @@ void libblis_test_trmm3_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -154,7 +154,7 @@ void libblis_test_trmm3_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -168,6 +168,8 @@ void libblis_test_trmm3_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n; dim_t mn_side; @@ -181,6 +183,9 @@ void libblis_test_trmm3_experiment obj_t c_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_trmv.c b/testsuite/src/test_trmv.c index 7b9143f92..bd39d30e1 100644 --- a/testsuite/src/test_trmv.c +++ b/testsuite/src/test_trmv.c @@ -59,7 +59,7 @@ void libblis_test_trmv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -145,7 +145,7 @@ void libblis_test_trmv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -159,6 +159,8 @@ void libblis_test_trmv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; uplo_t uploa; @@ -169,6 +171,9 @@ void libblis_test_trmv_experiment obj_t x_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_trsm.c b/testsuite/src/test_trsm.c index e571ac0bb..23cb1e5b5 100644 --- a/testsuite/src/test_trsm.c +++ b/testsuite/src/test_trsm.c @@ -59,7 +59,7 @@ void libblis_test_trsm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -150,7 +150,7 @@ void libblis_test_trsm_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -164,6 +164,8 @@ void libblis_test_trsm_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n; dim_t mn_side; @@ -176,6 +178,9 @@ void libblis_test_trsm_experiment obj_t b_save; + // Use the datatype of the first char in the datatype combination string. 
+ bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to actual dimensions. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index eb3c06520..555cf9fbb 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -59,7 +59,7 @@ void libblis_test_trsm_ukr_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -154,7 +154,7 @@ void libblis_test_trsm_ukr_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -168,6 +168,8 @@ void libblis_test_trsm_ukr_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m, n; char sc_a = 'c'; @@ -182,9 +184,13 @@ void libblis_test_trsm_ukr_experiment cntx_t* cntx; + // Query a context. cntx = bli_gks_query_cntx(); + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Fix m and n to MR and NR, respectively. 
m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx ); n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx ); diff --git a/testsuite/src/test_trsv.c b/testsuite/src/test_trsv.c index bedf5039a..6bc3b220f 100644 --- a/testsuite/src/test_trsv.c +++ b/testsuite/src/test_trsv.c @@ -59,7 +59,7 @@ void libblis_test_trsv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -145,7 +145,7 @@ void libblis_test_trsv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -159,6 +159,8 @@ void libblis_test_trsv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; uplo_t uploa; @@ -169,6 +171,9 @@ void libblis_test_trsv_experiment obj_t x_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_xpbym.c b/testsuite/src/test_xpbym.c new file mode 100644 index 000000000..1192fdb10 --- /dev/null +++ b/testsuite/src/test_xpbym.c @@ -0,0 +1,314 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2018, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "test_libblis.h" + + +// Static variables. +static char* op_str = "xpbym"; +static char* o_types = "mm"; // x y +static char* p_types = "h"; // transx +static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s + { 1e-04, 1e-05 }, // warn, pass for c + { 1e-13, 1e-14 }, // warn, pass for d + { 1e-13, 1e-14 } }; // warn, pass for z + +// Local prototypes. 
+void libblis_test_xpbym_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + +void libblis_test_xpbym_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ); + +void libblis_test_xpbym_impl + ( + iface_t iface, + obj_t* x, + obj_t* beta, + obj_t* y + ); + +void libblis_test_xpbym_check + ( + test_params_t* params, + obj_t* x, + obj_t* beta, + obj_t* y, + obj_t* y_save, + double* resid + ); + + + +void libblis_test_xpbym_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + libblis_test_randm( tdata, params, &(op->ops->randm) ); + libblis_test_normfm( tdata, params, &(op->ops->normfm) ); + libblis_test_addm( tdata, params, &(op->ops->addm) ); + libblis_test_subm( tdata, params, &(op->ops->subm) ); + libblis_test_copym( tdata, params, &(op->ops->copym) ); + libblis_test_scalm( tdata, params, &(op->ops->scalm) ); +} + + + +void libblis_test_xpbym + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + + // Return early if this test has already been done. + if ( libblis_test_op_is_done( op ) ) return; + + // Return early if operation is disabled. + if ( libblis_test_op_is_disabled( op ) || + libblis_test_l1m_is_disabled( op ) ) return; + + // Call dependencies first. + if ( TRUE ) libblis_test_xpbym_deps( tdata, params, op ); + + // Execute the test driver for each implementation requested. 
+ //if ( op->front_seq == ENABLE ) + { + libblis_test_op_driver( tdata, + params, + op, + BLIS_TEST_SEQ_FRONT_END, + op_str, + p_types, + o_types, + thresh, + libblis_test_xpbym_experiment ); + } +} + + + +void libblis_test_xpbym_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ) +{ + unsigned int n_repeats = params->n_repeats; + unsigned int i; + + double time_min = DBL_MAX; + double time; + + num_t datatype; + + dim_t m, n; + + trans_t transx; + + obj_t x, beta, y; + obj_t y_save; + + + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + + // Map the dimension specifier to actual dimensions. + m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); + n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); + + // Map parameter characters to BLIS constants. + bli_param_map_char_to_blis_trans( pc_str[0], &transx ); + + // Create test scalars. + bli_obj_scalar_init_detached( datatype, &beta ); + + // Create test operands (vectors and/or matrices). + libblis_test_mobj_create( params, datatype, transx, + sc_str[0], m, n, &x ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[0], m, n, &y ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[0], m, n, &y_save ); + + // Set beta. + if ( bli_obj_is_real( &y ) ) + bli_setsc( -2.0, 0.0, &beta ); + else + bli_setsc( 0.0, -2.0, &beta ); + + // Randomize x and y, and save y. + libblis_test_mobj_randomize( params, FALSE, &x ); + libblis_test_mobj_randomize( params, FALSE, &y ); + bli_copym( &y, &y_save ); + + // Apply the parameters. + bli_obj_set_conjtrans( transx, &x ); + + // Repeat the experiment n_repeats times and record results. 
+ for ( i = 0; i < n_repeats; ++i ) + { + bli_copym( &y_save, &y ); + + time = bli_clock(); + + libblis_test_xpbym_impl( iface, &x, &beta, &y ); + + time_min = bli_clock_min_diff( time_min, time ); + } + + // Estimate the performance of the best experiment repeat. + *perf = ( 2.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF; + if ( bli_obj_is_complex( &y ) ) *perf *= 4.0; + + // Perform checks. + libblis_test_xpbym_check( params, &x, &beta, &y, &y_save, resid ); + + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + + // Free the test objects. + bli_obj_free( &x ); + bli_obj_free( &y ); + bli_obj_free( &y_save ); +} + + + +void libblis_test_xpbym_impl + ( + iface_t iface, + obj_t* x, + obj_t* beta, + obj_t* y + ) +{ + switch ( iface ) + { + case BLIS_TEST_SEQ_FRONT_END: + bli_xpbym( x, beta, y ); + break; + + default: + libblis_test_printf_error( "Invalid interface type.\n" ); + } +} + + + +void libblis_test_xpbym_check + ( + test_params_t* params, + obj_t* x, + obj_t* beta, + obj_t* y, + obj_t* y_orig, + double* resid + ) +{ + num_t dt = bli_obj_dt( y ); + num_t dt_real = bli_obj_dt_proj_to_real( y ); + + dim_t m = bli_obj_length( y ); + dim_t n = bli_obj_width( y ); + + obj_t x_temp, y_temp; + obj_t norm; + + double junk; + + // + // Pre-conditions: + // - x is randomized. + // - y_orig is randomized. + // Note: + // - beta should have a non-zero imaginary component in the complex + // cases in order to more fully exercise the implementation. + // + // Under these conditions, we assume that the implementation for + // + // y := beta * y_orig + transx(x) + // + // is functioning correctly if + // + // normf( y - ( beta * y_orig + transx(x) ) ) + // + // is negligible. 
+ // + + bli_obj_scalar_init_detached( dt_real, &norm ); + + bli_obj_create( dt, m, n, 0, 0, &x_temp ); + bli_obj_create( dt, m, n, 0, 0, &y_temp ); + + bli_copym( x, &x_temp ); + bli_copym( y_orig, &y_temp ); + + bli_scalm( beta, &y_temp ); + bli_addm( &x_temp, &y_temp ); + + bli_subm( &y_temp, y ); + bli_normfm( y, &norm ); + bli_getsc( &norm, resid, &junk ); + + bli_obj_free( &x_temp ); + bli_obj_free( &y_temp ); +} + diff --git a/testsuite/src/test_xpbym.h b/testsuite/src/test_xpbym.h new file mode 100644 index 000000000..c272b1b90 --- /dev/null +++ b/testsuite/src/test_xpbym.h @@ -0,0 +1,41 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2018, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void libblis_test_xpbym + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + diff --git a/testsuite/src/test_xpbyv.c b/testsuite/src/test_xpbyv.c index 694d19f30..8fc9b7201 100644 --- a/testsuite/src/test_xpbyv.c +++ b/testsuite/src/test_xpbyv.c @@ -59,7 +59,7 @@ void libblis_test_xpbyv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -143,7 +143,7 @@ void libblis_test_xpbyv_experiment test_params_t* params, test_op_t* op, iface_t iface, - num_t datatype, + char* dc_str, char* pc_str, char* sc_str, unsigned int p_cur, @@ -157,6 +157,8 @@ void libblis_test_xpbyv_experiment double time_min = DBL_MAX; double time; + num_t datatype; + dim_t m; conj_t conjx; @@ -165,6 +167,9 @@ void libblis_test_xpbyv_experiment obj_t y_save; + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );