diff --git a/config/bgq/bli_kernel.h b/config/bgq/bli_kernel.h index a4fccc13a..6d8593515 100644 --- a/config/bgq/bli_kernel.h +++ b/config/bgq/bli_kernel.h @@ -144,25 +144,7 @@ // -- Default fusing factors for level-1f operations -- -#define BLIS_DEFAULT_1F_S 8 -#define BLIS_DEFAULT_1F_D 8 -#define BLIS_DEFAULT_1F_C 4 -#define BLIS_DEFAULT_1F_Z 2 - -#define BLIS_DEFAULT_AF_S BLIS_DEFAULT_1F_S -#define BLIS_DEFAULT_AF_D BLIS_DEFAULT_1F_D -#define BLIS_DEFAULT_AF_C BLIS_DEFAULT_1F_C -#define BLIS_DEFAULT_AF_Z BLIS_DEFAULT_1F_Z - -#define BLIS_DEFAULT_DF_S BLIS_DEFAULT_1F_S -#define BLIS_DEFAULT_DF_D BLIS_DEFAULT_1F_D -#define BLIS_DEFAULT_DF_C BLIS_DEFAULT_1F_C -#define BLIS_DEFAULT_DF_Z BLIS_DEFAULT_1F_Z - -#define BLIS_DEFAULT_XF_S BLIS_DEFAULT_1F_S -#define BLIS_DEFAULT_XF_D BLIS_DEFAULT_1F_D -#define BLIS_DEFAULT_XF_C BLIS_DEFAULT_1F_C -#define BLIS_DEFAULT_XF_Z BLIS_DEFAULT_1F_Z +#define BLIS_DEFAULT_AF_D 8 diff --git a/config/template/kernels/3/bli_gemm_opt_mxn.c b/config/template/kernels/3/bli_gemm_opt_mxn.c index 79da4b1ac..5fda378e0 100644 --- a/config/template/kernels/3/bli_gemm_opt_mxn.c +++ b/config/template/kernels/3/bli_gemm_opt_mxn.c @@ -93,129 +93,11 @@ void bli_dgemm_opt_mxn where A1 is MR x k, B1 is k x NR, C11 is MR x NR, and alpha and beta are scalars. - Parameters: + For more info, please refer to the BLIS website's wiki on kernels: - - k: The number of columns of A1 and rows of B1. - - alpha: The address of a scalar to the A1 * B1 product. - - a1: The address of a micro-panel of matrix A of dimension MR x k, - stored by columns with leading dimension PACKMR, where - typically PACKMR = MR. - - b1: The address of a micro-panel of matrix B of dimension k x NR, - stored by rows with leading dimension PACKNR, where typically - PACKNR = NR. - - beta: The address of a scalar to the input value of matrix C11. - - c11: The address of a submatrix C11 of dimension MR x NR, stored - according to rs_c and cs_c. 
- - rs_c: The row stride of matrix C11 (ie: the distance to the next row, - in units of matrix elements). - - cs_c: The column stride of matrix C11 (ie: the distance to the next - column, in units of matrix elements). - - data: The address of an auxinfo_t object that contains auxiliary - information that may be useful when optimizing the gemm - micro-kernel implementation. (See BLIS KernelsHowTo wiki for - more info.) - - cntx: The address of the runtime context. The context can be queried - for implementation-specific values such as cache and register - blocksizes. However, most micro-kernels intrinsically "know" - these values already, and thus the cntx argument usually can - be safely ignored. (The following template micro-kernel code - does in fact query MR, NR, PACKMR, and PACKNR, as needed, but - only because those values are not hard-coded, as they would be - in a typical optimized micro-kernel implementation.) + https://github.com/flame/blis/wiki/KernelsHowTo - Diagram for gemm - - The diagram below shows the packed micro-panel operands and how elements - of each would be stored when MR = NR = 4. The hex digits indicate the - layout and order (but NOT the numeric contents) of the elements in - memory. Note that the storage of C11 is not shown since it is determined - by the row and column strides of C11. - - c11: a1: b1: - _______ ______________________ _______ - | | |0 4 8 C | |0 1 2 3| - MR | | |1 5 9 D . . . | |4 5 6 7| - | | += |2 6 A E | |8 9 A B| - |_______| |3_7_B_F_______________| |C D E F| - | . | - NR k | . | k - | . | - | | - | | - |_______| - - NR - Implementation Notes for gemm - - - Register blocksizes. The C preprocessor macros bli_?mr and bli_?nr - evaluate to the MR and NR register blocksizes for the datatype - corresponding to the '?' character. These values are abbreviations - of the macro constants BLIS_DEFAULT_MR_? and BLIS_DEFAULT_NR_?, - which are defined in the bli_kernel.h header file of the BLIS - configuration. 
- - Leading dimensions of a1 and b1: PACKMR and PACKNR. The packed - micro-panels a1 and b1 are simply stored in column-major and row-major - order, respectively. Usually, the width of either micro-panel (ie: - the number of rows of A1, or MR, and the number of columns of B1, or - NR) is equal to that micro-panel's so-called "leading dimension." - Sometimes, it may be beneficial to specify a leading dimension that - is larger than the panel width. This may be desirable because it - allows each column of A1 or row of B1 to maintain a certain alignment - in memory that would not otherwise be maintained by MR and/or NR. In - this case, you should index through a1 and b1 using the values PACKMR - and PACKNR, respectively, as defined by bli_?packmr and bli_?packnr. - These values are defined as BLIS_PACKDIM_MR_? and BLIS_PACKDIM_NR_?, - respectively, in the bli_kernel.h header file of the BLIS - configuration. - - Storage preference of c11: Sometimes, an optimized micro-kernel will - have a preferred storage format for C11--typically either contiguous - row-storage or contiguous column-storage. This preference comes from - how the micro-kernel is most efficiently able to load/store elements - of C11 from/to memory. Most micro-kernels use vector instructions to - load and store contigous columns (or column segments) of C11. However, - the developer may decide that loading contiguous rows (or row - segments) is desirable. If this is the case, this preference should be - noted in bli_kernel.h by defining the macro - BLIS_?GEMM_UKERNEL_PREFERS_CONTIG_ROWS. Leaving the macro undefined - leaves the default assumption (contiguous column preference) in - place. Setting this macro allows the framework to perform a minor - optimization at run-time that will ensure the micro-kernel preference - is honored, if at all possible. - - Edge cases in MR, NR dimensions. 
Sometimes the micro-kernel will be - called with micro-panels a1 and b1 that correspond to edge cases, - where only partial results are needed. Zero-padding is handled - automatically by the packing function to facilitate reuse of the same - micro-kernel. Similarly, the logic for computing to temporary storage - and then saving only the elements that correspond to elements of C11 - that exist (at the edges) is handled automatically within the - macro-kernel. - - Alignment of a1 and b1. By default, the alignment of addresses a1 and - b1 are aligned only to sizeof(type). If BLIS_CONTIG_ADDR_ALIGN_SIZE is - set to some larger multiple of sizeof(type), such as the page size, - then a1 and b1 will be aligned to PACKMR * sizeof(type) and PACKNR * - sizeof(type), respectively. Alignment of a1 and b1 is also affected - by BLIS_UPANEL_A_ALIGN_SIZE_? and BLIS_UPANEL_B_ALIGN_SIZE_?, which - align the distance (stride) between subsequent micro-panels. (By - default, those values are simply sizeof(type), in which case they have - no effect.) - - Unrolling loops. As a general rule of thumb, the loop over k is - sometimes moderately unrolled; for example, in our experience, an - unrolling factor of u = 4 is fairly common. If unrolling is applied - in the k dimension, edge cases must be handled to support values of k - that are not multiples of u. It is nearly universally true that there - should be no loops in the MR or NR directions; in other words, - iteration over these dimensions should always be fully unrolled - (within the loop over k). - - Zero beta. If beta = 0.0 (or 0.0 + 0.0i for complex datatypes), then - the micro-kernel should NOT use it explicitly, as C11 may contain - uninitialized memory (including NaNs). This case should be detected - and handled separately, preferably by simply overwriting C11 with the - alpha * A1 * B1 product. 
An example of how to perform this "beta equals - zero" handling is included in the gemm micro-kernel associated with - the template configuration. - - For more info, please refer to the BLIS website and/or contact the - blis-devel mailing list. + and/or contact the blis-devel mailing list. -FGVZ */ diff --git a/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.c b/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.c index 1ae61aa97..edad8be36 100644 --- a/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.c +++ b/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.c @@ -104,118 +104,11 @@ void bli_dgemmtrsm_l_opt_mxn B11 is MR x NR, and alpha is a scalar. Here, inv() denotes matrix inverse. - Parameters: + For more info, please refer to the BLIS website's wiki on kernels: - - k: The number of columns of A10 and rows of B01. - - alpha: The address of a scalar to be applied to B11. - - a10: The address of A10, which is the MR x k submatrix of the packed - micro-panel of A that is situated to the left of the MR x MR - triangular submatrix A11. A10 is stored by columns with leading - dimension PACKMR, where typically PACKMR = MR. - - a11: The address of A11, which is the MR x MR lower triangular - submatrix within the packed micro-panel of matrix A that is - situated to the right of A10. A11 is stored by columns with - leading dimension PACKMR, where typically PACKMR = MR. Note - that A11 contains elements in both triangles, though elements - in the unstored triangle are not guaranteed to be zero and - thus should not be referenced. - - b01: The address of B01, which is the k x NR submatrix of the packed - micro-panel of B that is situated above the MR x NR submatrix - B11. B01 is stored by rows with leading dimension PACKNR, where - typically PACKNR = NR. - - b11: The address B11, which is the MR x NR submatrix of the packed - micro-panel of B, situated below B01. B11 is stored by rows - with leading dimension PACKNR, where typically PACKNR = NR. 
- - c11: The address of C11, which is the MR x NR submatrix of matrix - C, stored according to rs_c and cs_c. C11 is the submatrix - within C that corresponds to the elements which were packed - into B11. Thus, C is the original input matrix B to the overall - trsm operation. - - rs_c: The row stride of C11 (ie: the distance to the next row of C11, - in units of matrix elements). - - cs_c: The column stride of C11 (ie: the distance to the next column of - C11, in units of matrix elements). - - data: The address of an auxinfo_t object that contains auxiliary - information that may be useful when optimizing the gemmtrsm - micro-kernel implementation. (See BLIS KernelsHowTo wiki for - more info.) - - cntx: The address of the runtime context. The context can be queried - for implementation-specific values such as cache and register - blocksizes. However, most micro-kernels intrinsically "know" - these values already, and thus the cntx argument usually can - be safely ignored. (The following template micro-kernel code - does in fact query MR, NR, PACKMR, and PACKNR, as needed, but - only because those values are not hard-coded, as they would be - in a typical optimized micro-kernel implementation.) + https://github.com/flame/blis/wiki/KernelsHowTo - Diagram for gemmtrsm_l - - The diagram below shows the packed micro-panel operands for trsm_l and - how elements of each would be stored when MR = NR = 4. (The hex digits - indicate the layout and order (but NOT the numeric contents) in memory. - Here, matrix A11 (referenced by a11) is lower triangular. Matrix A11 - does contain elements corresponding to the strictly upper triangle, - however, they are not guaranteed to contain zeros and thus these elements - should not be referenced. - - NR - _______ - b01:|0 1 2 3| - |4 5 6 7| - |8 9 A B| - |C D E F| - k | . | - | . | - a10: a11: | . | - ___________________ _______ |_______| - |0 4 8 C |`. | b11:| | - MR |1 5 9 D . . . | `. | | | - |2 6 A E | `. 
| MR | | - |3_7_B_F____________|______`.| |_______| - - k MR - - - Implementation Notes for gemmtrsm - - - Register blocksizes. See Implementation Notes for gemm. - - Leading dimensions of a1 and b1: PACKMR and PACKNR. See Implementation - Notes for gemm. - - Edge cases in MR, NR dimensions. See Implementation Notes for gemm. - - Alignment of a1 and b1. The addresses a1 and b1 are aligned according - to PACKMR*sizeof(type) and PACKNR*sizeof(type), respectively. - - Unrolling loops. Most optimized implementations should unroll all - three loops within the trsm subproblem of gemmtrsm. See Implementation - Notes for gemm for remarks on unrolling the gemm subproblem. - - Prefetching next micro-panels of A and B. When invoked from within a - gemmtrsm_l micro-kernel, the addresses accessible via - bli_auxinfo_next_a() and bli_auxinfo_next_b() refer to the next - invocation's a10 and b01, respectively, while in gemmtrsm_u, the - _next_a() and _next_b() macros return the addresses of the next - invocation's a11 and b11 (since those submatrices precede a12 and b21). - (See BLIS KernelsHowTo wiki for more info.) - - Zero alpha. The micro-kernel can safely assume that alpha is non-zero; - "alpha equals zero" handling is performed at a much higher level, - which means that, in such a scenario, the micro-kernel will never get - called. - - Diagonal elements of A11. See Implementation Notes for trsm. - - Zero elements of A11. See Implementation Notes for trsm. - - Output. See Implementation Notes for trsm. - - Optimization. Let's assume that the gemm micro-kernel has already been - optimized. You have two options with regard to optimizing the fused - gemmtrsm micro-kernels: - (1) Optimize only the trsm micro-kernels. This will result in the gemm - and trsm_l micro-kernels being called in sequence. (Likewise for - gemm and trsm_u.) 
- (2) Fuse the implementation of the gemm micro-kernel with that of the - trsm micro-kernels by inlining both into the gemmtrsm_l and - gemmtrsm_u micro-kernel definitions. This option is more labor- - intensive, but also more likely to yield higher performance because - it avoids redundant memory operations on the packed MR x NR - submatrix B11. - - For more info, please refer to the BLIS website and/or contact the - blis-devel mailing list. + and/or contact the blis-devel mailing list. -FGVZ */ diff --git a/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.c b/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.c index 58616a644..b0fd27b1e 100644 --- a/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.c +++ b/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.c @@ -104,117 +104,13 @@ void bli_dgemmtrsm_u_opt_mxn B11 is MR x NR, and alpha is a scalar. Here, inv() denotes matrix inverse. - Parameters: + For more info, please refer to the BLIS website's wiki on kernels: - - k: The number of columns of A12 and rows of B21. - - alpha: The address of a scalar to be applied to B11. - - a12: The address of A12, which is the MR x k submatrix of the packed - micro-panel of A that is situated to the right of the MR x MR - triangular submatrix A11. A12 is stored by columns with leading - dimension PACKMR, where typically PACKMR = MR. - - a11: The address of A11, which is the MR x MR upper triangular - submatrix within the packed micro-panel of matrix A that is - situated to the left of A12. A11 is stored by columns with - leading dimension PACKMR, where typically PACKMR = MR. Note - that A11 contains elements in both triangles, though elements - in the unstored triangle are not guaranteed to be zero and - thus should not be referenced. - - b21: The address of B21, which is the k x NR submatrix of the packed - micro-panel of B that is situated above the MR x NR submatrix - B11. B01 is stored by rows with leading dimension PACKNR, where - typically PACKNR = NR. 
- - b11: The address B11, which is the MR x NR submatrix of the packed - micro-panel of B, situated below B01. B11 is stored by rows - with leading dimension PACKNR, where typically PACKNR = NR. - - c11: The address of C11, which is the MR x NR submatrix of matrix - C, stored according to rs_c and cs_c. C11 is the submatrix - within C that corresponds to the elements which were packed - into B11. Thus, C is the original input matrix B to the overall - trsm operation. - - rs_c: The row stride of C11 (ie: the distance to the next row of C11, - in units of matrix elements). - - cs_c: The column stride of C11 (ie: the distance to the next column of - C11, in units of matrix elements). - - data: The address of an auxinfo_t object that contains auxiliary - information that may be useful when optimizing the gemmtrsm - micro-kernel implementation. (See BLIS KernelsHowTo wiki for - more info.) - - cntx: The address of the runtime context. The context can be queried - for implementation-specific values such as cache and register - blocksizes. However, most micro-kernels intrinsically "know" - these values already, and thus the cntx argument usually can - be safely ignored. (The following template micro-kernel code - does in fact query MR, NR, PACKMR, and PACKNR, as needed, but - only because those values are not hard-coded, as they would be - in a typical optimized micro-kernel implementation.) + https://github.com/flame/blis/wiki/KernelsHowTo - Diagram for gemmtrsm_u - - The diagram below shows the packed micro-panel operands for trsm_l and - how elements of each would be stored when MR = NR = 4. (The hex digits - indicate the layout and order (but NOT the numeric contents) in memory. - Here, matrix A11 (referenced by a11) is upper triangular. Matrix A11 - does contain elements corresponding to the strictly lower triangle, - however, they are not guaranteed to contain zeros and thus these elements - should not be referenced. 
- - a11: a12: NR - ________ ___________________ _______ - |`. |0 4 8 | b11:|0 1 2 3| - MR | `. |1 5 9 . . . | |4 5 6 7| - | `. |2 6 A | MR |8 9 A B| - |______`.|3_7_B______________| |___.___| - b21:| . | - MR k | . | - | | - | | - NOTE: Storage digits are shown k | | - starting with a12 to avoid | | - obscuring triangular structure | | - of a11. |_______| - - - Implementation Notes for gemmtrsm - - - Register blocksizes. See Implementation Notes for gemm. - - Leading dimensions of a1 and b1: PACKMR and PACKNR. See Implementation - Notes for gemm. - - Edge cases in MR, NR dimensions. See Implementation Notes for gemm. - - Alignment of a1 and b1. The addresses a1 and b1 are aligned according - to PACKMR*sizeof(type) and PACKNR*sizeof(type), respectively. - - Unrolling loops. Most optimized implementations should unroll all - three loops within the trsm subproblem of gemmtrsm. See Implementation - Notes for gemm for remarks on unrolling the gemm subproblem. - - Prefetching next micro-panels of A and B. When invoked from within a - gemmtrsm_l micro-kernel, the addresses accessible via - bli_auxinfo_next_a() and bli_auxinfo_next_b() refer to the next - invocation's a10 and b01, respectively, while in gemmtrsm_u, the - _next_a() and _next_b() macros return the addresses of the next - invocation's a11 and b11 (since those submatrices precede a12 and b21). - (See BLIS KernelsHowTo wiki for more info.) - - Zero alpha. The micro-kernel can safely assume that alpha is non-zero; - "alpha equals zero" handling is performed at a much higher level, - which means that, in such a scenario, the micro-kernel will never get - called. - - Diagonal elements of A11. See Implementation Notes for trsm. - - Zero elements of A11. See Implementation Notes for trsm. - - Output. See Implementation Notes for trsm. - - Optimization. Let's assume that the gemm micro-kernel has already been - optimized. 
You have two options with regard to optimizing the fused - gemmtrsm micro-kernels: - (1) Optimize only the trsm micro-kernels. This will result in the gemm - and trsm_l micro-kernels being called in sequence. (Likewise for - gemm and trsm_u.) - (2) Fuse the implementation of the gemm micro-kernel with that of the - trsm micro-kernels by inlining both into the gemmtrsm_l and - gemmtrsm_u micro-kernel definitions. This option is more labor- - intensive, but also more likely to yield higher performance because - it avoids redundant memory operations on the packed MR x NR - submatrix B11. - - For more info, please refer to the BLIS website and/or contact the - blis-devel mailing list. + and/or contact the blis-devel mailing list. + -FGVZ */ const num_t dt = BLIS_DOUBLE; diff --git a/config/template/kernels/3/bli_trsm_l_opt_mxn.c b/config/template/kernels/3/bli_trsm_l_opt_mxn.c index a28760b88..d9513f1fe 100644 --- a/config/template/kernels/3/bli_trsm_l_opt_mxn.c +++ b/config/template/kernels/3/bli_trsm_l_opt_mxn.c @@ -84,77 +84,11 @@ void bli_dtrsm_l_opt_mxn where A11 is MR x MR and lower triangular, B11 is MR x NR, and C11 is MR x NR. - Parameters: + For more info, please refer to the BLIS website's wiki on kernels: - - a11: The address of A11, which is the MR x MR lower triangular - submatrix within the packed micro-panel of matrix A. A11 is - stored by columns with leading dimension PACKMR, where - typically PACKMR = MR. Note that A11 contains elements in both - triangles, though elements in the unstored triangle are not - guaranteed to be zero and thus should not be referenced. - - b11: The address of B11, which is an MR x NR submatrix of the - packed micro-panel of B. B11 is stored by rows with leading - dimension PACKNR, where typically PACKNR = NR. - - c11: The address of C11, which is an MR x NR submatrix of matrix C, - stored according to rs_c and cs_c. C11 is the submatrix within - C that corresponds to the elements which were packed into B11. 
- Thus, C is the original input matrix B to the overall trsm - operation. - - rs_c: The row stride of C11 (ie: the distance to the next row of C11, - in units of matrix elements). - - cs_c: The column stride of C11 (ie: the distance to the next column of - C11, in units of matrix elements). - - data: The address of an auxinfo_t object that contains auxiliary - information that may be useful when optimizing the trsm - micro-kernel implementation. (See BLIS KernelsHowTo wiki for - more info.) - - cntx: The address of the runtime context. The context can be queried - for implementation-specific values such as cache and register - blocksizes. However, most micro-kernels intrinsically "know" - these values already, and thus the cntx argument usually can - be safely ignored. (The following template micro-kernel code - does in fact query MR, NR, PACKMR, and PACKNR, as needed, but - only because those values are not hard-coded, as they would be - in a typical optimized micro-kernel implementation.) + https://github.com/flame/blis/wiki/KernelsHowTo - Diagrams for trsm - - Please see the diagram for gemmtrsm_l to see depiction of the trsm_l and - where it fits in with its preceding gemm subproblem. - - Implementation Notes for trsm - - - Register blocksizes. See Implementation Notes for gemm. - - Leading dimensions of a11 and b11: PACKMR and PACKNR. See - Implementation Notes for gemm. - - Edge cases in MR, NR dimensions. See Implementation Notes for gemm. - - Alignment of a11 and b11. See Implementation Notes for gemmtrsm. - - Unrolling loops. Most optimized implementations should unroll all - three loops within the trsm micro-kernel. - - Prefetching next micro-panels of A and B. We advise against using - the bli_auxinfo_next_a() and bli_auxinfo_next_b() macros from within - the trsm_l and trsm_u micro-kernels, since the values returned usually - only make sense in the context of the overall gemmtrsm subproblem. - - Diagonal elements of A11. 
At the time this micro-kernel is called, - the diagonal entries of triangular matrix A11 contain the inverse of - the original elements. This inversion is done during packing so that - we can avoid expensive division instructions within the micro-kernel - itself. If the diag parameter to the higher level trsm operation was - equal to BLIS_UNIT_DIAG, the diagonal elements will be explicitly - unit. - - Zero elements of A11. Since A11 is lower triangular (for trsm_l), the - strictly upper triangle implicitly contains zeros. Similarly, the - strictly lower triangle of A11 implicitly contains zeros when A11 is - upper triangular (for trsm_u). However, the packing function may or - may not actually write zeros to this region. Thus, while the - implementation may reference these elements, it should not use them - in any computation. - - Output. This micro-kernel must write its result to two places: the - submatrix B11 of the current packed micro-panel of B and the submatrix - C11 of the output matrix C. - - For more info, please refer to the BLIS website and/or contact the - blis-devel mailing list. + and/or contact the blis-devel mailing list. -FGVZ */ diff --git a/config/template/kernels/3/bli_trsm_u_opt_mxn.c b/config/template/kernels/3/bli_trsm_u_opt_mxn.c index ba0b46753..ce68db673 100644 --- a/config/template/kernels/3/bli_trsm_u_opt_mxn.c +++ b/config/template/kernels/3/bli_trsm_u_opt_mxn.c @@ -89,76 +89,11 @@ void bli_dtrsm_u_opt_mxn( where A11 is MR x MR and upper triangular, B11 is MR x NR, and C11 is MR x NR. - Parameters: + For more info, please refer to the BLIS website's wiki on kernels: - - a11: The address of A11, which is the MR x MR upper triangular - submatrix within the packed micro-panel of matrix A. A11 is - stored by columns with leading dimension PACKMR, where - typically PACKMR = MR. Note that A11 contains elements in both - triangles, though elements in the unstored triangle are not - guaranteed to be zero and thus should not be referenced. 
- - b11: The address of B11, which is an MR x NR submatrix of the - packed micro-panel of B. B11 is stored by rows with leading - dimension PACKNR, where typically PACKNR = NR. - - c11: The address of C11, which is an MR x NR submatrix of matrix C, - stored according to rs_c and cs_c. C11 is the submatrix within - C that corresponds to the elements which were packed into B11. - Thus, C is the original input matrix B to the overall trsm - operation. - - rs_c: The row stride of C11 (ie: the distance to the next row of C11, - in units of matrix elements). - - cs_c: The column stride of C11 (ie: the distance to the next column of - C11, in units of matrix elements). - - data: The address of an auxinfo_t object that contains auxiliary - information that may be useful when optimizing the trsm - micro-kernel implementation. (See BLIS KernelsHowTo wiki for - more info.) - - cntx: The address of the runtime context. The context can be queried - for implementation-specific values such as cache and register - blocksizes. However, most micro-kernels intrinsically "know" - these values already, and thus the cntx argument usually can - be safely ignored. (The following template micro-kernel code - does in fact query MR, NR, PACKMR, and PACKNR, as needed, but - only because those values are not hard-coded, as they would be - in a typical optimized micro-kernel implementation.) + https://github.com/flame/blis/wiki/KernelsHowTo - Diagrams for trsm - - Please see the diagram for gemmtrsm_u to see depiction of the trsm_u and - where it fits in with its preceding gemm subproblem. - - Implementation Notes for trsm - - - Register blocksizes. See Implementation Notes for gemm. - - Leading dimensions of a11 and b11: PACKMR and PACKNR. See - Implementation Notes for gemm. - - Edge cases in MR, NR dimensions. See Implementation Notes for gemm. - - Alignment of a11 and b11. See Implementation Notes for gemmtrsm. - - Unrolling loops. 
Most optimized implementations should unroll all - three loops within the trsm micro-kernel. - - Prefetching next micro-panels of A and B. We advise against using - the bli_auxinfo_next_a() and bli_auxinfo_next_b() macros from within - the trsm_l and trsm_u micro-kernels, since the values returned usually - only make sense in the context of the overall gemmtrsm subproblem. - - Diagonal elements of A11. At the time this micro-kernel is called, - the diagonal entries of triangular matrix A11 contain the inverse of - the original elements. This inversion is done during packing so that - we can avoid expensive division instructions within the micro-kernel - itself. If the diag parameter to the higher level trsm operation was - equal to BLIS_UNIT_DIAG, the diagonal elements will be explicitly - unit. - - Zero elements of A11. Since A11 is lower triangular (for trsm_l), the - strictly upper triangle implicitly contains zeros. Similarly, the - strictly lower triangle of A11 implicitly contains zeros when A11 is - upper triangular (for trsm_u). However, the packing function may or - may not actually write zeros to this region. Thus, the implementation - should not reference these elements. - - Output. This micro-kernel must write its result to two places: the - submatrix B11 of the current packed micro-panel of B and the submatrix - C11 of the output matrix C. - - For more info, please refer to the BLIS website and/or contact the - blis-devel mailing list. + and/or contact the blis-devel mailing list. 
-FGVZ */ diff --git a/kernels/bgq/1f/bli_axpyf_opt_var1.c b/kernels/bgq/1f/bli_axpyf_opt_var1.c index 2af7d1e2f..5bcae61c7 100644 --- a/kernels/bgq/1f/bli_axpyf_opt_var1.c +++ b/kernels/bgq/1f/bli_axpyf_opt_var1.c @@ -48,18 +48,21 @@ void bli_daxpyf_opt_var1 cntx_t* cntx ) { + // Minimum b_n handled by the optimized path; smaller values are routed to the reference implementation below. + const dim_t fusefac = 8; + if ( bli_zero_dim2( m, b_n ) ) return; bool_t use_ref = FALSE; -// printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\n", b_n, PASTEMAC(d, axpyf_fusefac), inca, incx, incy, bli_is_unaligned_to(a, 32), bli_is_unaligned_to( y, 32)); +// printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\n", b_n, fusefac, inca, incx, incy, bli_is_unaligned_to(a, 32), bli_is_unaligned_to( y, 32)); // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. - if ( b_n < PASTEMAC(d,axpyf_fusefac) || inca != 1 || incx != 1 || incy != 1 || bli_is_unaligned_to( a, 32 ) || bli_is_unaligned_to( y, 32 ) ) + if ( b_n < fusefac || inca != 1 || incx != 1 || incy != 1 || bli_is_unaligned_to( a, 32 ) || bli_is_unaligned_to( y, 32 ) ) use_ref = TRUE; // Call the reference implementation if needed. if ( use_ref == TRUE ) { -// printf("%d\t%d\t%d\t%d\t%d\t%d\n", PASTEMAC(d, axpyf_fusefac), inca, incx, incy, bli_is_unaligned_to(a, 32), bli_is_unaligned_to( y, 32)); +// printf("%d\t%d\t%d\t%d\t%d\t%d\n", fusefac, inca, incx, incy, bli_is_unaligned_to(a, 32), bli_is_unaligned_to( y, 32)); // printf("DEFAULTING TO REFERENCE IMPLEMENTATION\n"); BLIS_DAXPYF_KERNEL_REF( conja, conjx, m, b_n, alpha, a, inca, lda, x, incx, y, incy ); return;