diff --git a/frame/2/gemv/bli_gemv_unf_var2_amd.c b/frame/2/gemv/bli_gemv_unf_var2_amd.c index 831d906ca..306c6b6b1 100644 --- a/frame/2/gemv/bli_gemv_unf_var2_amd.c +++ b/frame/2/gemv/bli_gemv_unf_var2_amd.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020-23, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -576,175 +576,225 @@ void bli_zgemv_unf_var2 ) { - AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); - dcomplex* A1; - dcomplex* x1; - dcomplex* y1; - dim_t i; - dim_t b_fuse, f; - dim_t n_elem, n_iter; - inc_t rs_at, cs_at; - conj_t conja; + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); - // For AMD these APIS are invoked skipping intermediate framework layers - // Hence we need to ensure that cntx is set here. - bli_init_once(); - if(cntx == NULL) cntx = bli_gks_query_cntx(); + dcomplex *A1; + dcomplex *x1; + dcomplex *y1; - bli_set_dims_incs_with_trans( transa, - m, n, rs_a, cs_a, - &n_elem, &n_iter, &rs_at, &cs_at ); + dim_t i, b_fuse, f; + dim_t n_elem, n_iter; + inc_t rs_at, cs_at; + conj_t conja; - conja = bli_extract_conj( transa ); + // Memory pool declarations for packing vector Y. + mem_t mem_bufY; + rntm_t rntm; + dcomplex *y_buf = y; + inc_t buf_incy = incy; - /* If beta is zero, use setv. Otherwise, scale by beta. */ - /* y = beta * y; */ + bli_set_dims_incs_with_trans(transa, + m, n, rs_a, cs_a, + &n_elem, &n_iter, &rs_at, &cs_at); - /* beta=0 case is hadled by scalv internally */ - /* bli_zscalv_zen_int10 - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y, - incy, - cntx - );*/ + conja = bli_extract_conj(transa); - // This function is invoked on all architectures including ‘generic’. - // Non-AVX platforms will use the kernels derived from the context. - if (bli_cpuid_is_avx_supported() == FALSE) + // Query the architecture ID + arch_t id = bli_arch_query_id(); + + /* + Function pointer declaration for the functions + that will be used by this API + */ + zaxpyf_ker_ft axpyf_kr_ptr; // ZAXPYF + zscal2v_ker_ft scal2v_kr_ptr; // ZSCAL2V + zscalv_ker_ft scalv_kr_ptr; // ZSCALV + zcopyv_ker_ft copyv_kr_ptr; // ZCOPYV + + /* + Boolean to check if the y has been packed + and memory needs to be freed in the end + */ + bool is_y_temp_buf_created = FALSE; + + switch (id) + { + case BLIS_ARCH_ZEN4: + case BLIS_ARCH_ZEN: + case BLIS_ARCH_ZEN2: + case BLIS_ARCH_ZEN3: + + /* + Assign the AVX2 based kernel function pointers for + ZAXPYF, ZSCAL2V, ZSCALV, ZCOPYV and corresponding fusing + factor of ZAXPYF kernel + */ + + axpyf_kr_ptr = bli_zaxpyf_zen_int_4; + b_fuse = 4; + + scal2v_kr_ptr = bli_zscal2v_zen_int; + + scalv_kr_ptr = bli_zscalv_zen_int; + + copyv_kr_ptr = bli_zcopyv_zen_int; + + break; + default: + // For non-Zen architectures, query the context if it is NULL + if(cntx == NULL) cntx = bli_gks_query_cntx(); + + /* + Query the context for the kernel function pointers for + ZAXPYF, ZSCAL2V, ZSCALV, ZCOPYV and corresponding fusing + factor of ZAXPYF kernel + */ + axpyf_kr_ptr = bli_cntx_get_l1f_ker_dt(BLIS_DCOMPLEX, BLIS_AXPYF_KER, cntx); + b_fuse = bli_cntx_get_blksz_def_dt(BLIS_DCOMPLEX, BLIS_AF, cntx); + + scal2v_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCAL2V_KER, cntx); + + scalv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCALV_KER, cntx); + + copyv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_COPYV_KER, cntx); + } + + /* + If alpha is equal to zero, y = beta * y + alpha * A * x + becomes y = beat * y in that case packing will be costly. + y is only scaled with SCALV and returned. + */ + if (incy > 1 && (!bli_zeq0(*alpha))) + { + /* + Initialize mem pool buffer to NULL and size to 0 + "buf" and "size" fields are assigned once memory + is allocated from the pool in bli_membrk_acquire_m(). + This will ensure bli_mem_is_alloc() will be passed on + an allocated memory if created or a NULL . + */ + mem_bufY.pblk.buf = NULL; + mem_bufY.pblk.block_size = 0; + mem_bufY.buf_type = 0; + mem_bufY.size = 0; + mem_bufY.pool = NULL; + + /* + In order to get the buffer from pool via rntm access to memory broker + is needed.Following are initializations for rntm + */ + + bli_rntm_init_from_global(&rntm); + bli_rntm_set_num_threads_only(1, &rntm); + bli_membrk_rntm_set_membrk(&rntm); + + // Calculate the size required for n_elem double elements in vector Y. + size_t buffer_size = n_elem * sizeof(dcomplex); + +#ifdef BLIS_ENABLE_MEM_TRACING + printf("bli_zgemv_unf_var2(): get mem pool block\n"); +#endif + + /* + Acquire a Buffer(n_elem*size(double)) from the memory broker + and save the associated mem_t entry to mem_bufY. + */ + bli_membrk_acquire_m(&rntm, + buffer_size, + BLIS_BUFFER_FOR_B_PANEL, + &mem_bufY); + + /* Continue packing Y if buffer memory is allocated */ + if ((bli_mem_is_alloc(&mem_bufY))) { - const num_t dt = PASTEMAC(z,type); - /* If beta is zero, use setv. Otherwise, scale by beta. */ - if ( PASTEMAC(z,eq0)( *beta ) ) - { - dcomplex* zero = PASTEMAC(z,0); - /* y = 0; */ - PASTEMAC2(z,setv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n_elem, - zero, - y, incy, - cntx, - NULL - ); - } - else - { - /* y = beta * y; */ - PASTEMAC2(z,scalv,BLIS_TAPI_EX_SUF) - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y, incy, - cntx, - NULL - ); - } + y_buf = bli_mem_buffer(&mem_bufY); + buf_incy = 1; - PASTECH(z,axpyf_ker_ft) kfp_af; + // Invoke the ZSCAL2V function using the function pointer + scal2v_kr_ptr + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y, incy, + y_buf, buf_incy, + NULL + ); - /* Query the context for the kernel function pointer and fusing factor. */ - kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); - b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); - - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); - - A1 = a + (0 )*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - y1 = y + (0 )*incy; - - /* y = y + alpha * A1 * x1; */ - kfp_af - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, rs_at, cs_at, - x1, incx, - y1, incy, - cntx - ); - } - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); - return; + /* + Set y is packed as the memory allocation was + successful and contents have been copied + */ + is_y_temp_buf_created = TRUE; } - - bli_zscalv_ex + } + else + { + // Invoke the ZSCALV function using the function pointer + scalv_kr_ptr ( BLIS_NO_CONJUGATE, n_elem, beta, + y_buf, buf_incy, + NULL + ); + } + + // If alpha is zero(0), we only need to scalv y and return + if (bli_zeq0(*alpha)) + { + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); + + // Return early for alpha is zero(0) + return; + } + + for (i = 0; i < n_iter; i += f) + { + f = bli_determine_blocksize_dim_f(i, n_iter, b_fuse); + A1 = a + (0) * rs_at + (i)*cs_at; + x1 = x + (i)*incx; + y1 = y_buf + (0) * buf_incy; + + // Invoke the ZAXPYF function using the function pointer + axpyf_kr_ptr + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, rs_at, cs_at, + x1, incx, + y1, buf_incy, + cntx + ); + } + + // Check if temp y buffer was used for compute + if (is_y_temp_buf_created) + { + // Store the result from unit strided y_buf to non-unit strided Y + // Invoke the ZCOPYV function using the function pointer + copyv_kr_ptr + ( + BLIS_NO_CONJUGATE, + n_elem, + y_buf, buf_incy, y, incy, - cntx, NULL ); - if( bli_zeq0( *alpha ) ) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); - return; - } +#ifdef BLIS_ENABLE_MEM_TRACING + printf("bli_zgemv_unf_var2(): releasing mem pool block\n"); +#endif - // for non-unit incx, incy and rs_at and conjugate will be added in the next patch - if( (incx == 1 && incy == 1 && rs_at == 1 ) && - !bli_is_conj(conja) && !bli_is_conj(conjx) && !bli_is_trans(transa)) - { - // This gemv code deals with the followint conditions only - // 1. incx, incy, and row stride equal to one - // 2. Non conjugate A matrix and X vector - // 3. No Transpose for A Martix - // Rest is taken care by the else part (axpyf implementation) - bli_zgemv_zen_int_4x4 - ( - conja, - conjx, - m, - n, - alpha, - a, rs_at, cs_at, - x, incx, - beta, - y, incy, - cntx - ); - } - else - { - /* fusing factor */ - b_fuse = 4; + // Return the buffer to pool + bli_membrk_release(&rntm, &mem_bufY); + } - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); - A1 = a + (0 )*rs_at + (i )*cs_at; - x1 = x + (i )*incx; - y1 = y + (0 )*incy; - - /* y = y + alpha * A1 * x1; */ - bli_zaxpyf_zen_int_4 - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, rs_at, cs_at, - x1, incx, - y1, incy, - cntx - ); - } - } - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); } void bli_cgemv_unf_var2 diff --git a/kernels/zen/1/bli_scal2v_zen_int.c b/kernels/zen/1/bli_scal2v_zen_int.c index 503b23f56..df3524774 100644 --- a/kernels/zen/1/bli_scal2v_zen_int.c +++ b/kernels/zen/1/bli_scal2v_zen_int.c @@ -137,6 +137,8 @@ void bli_zscal2v_zen_int /* If alpha is zero, use setv. */ dcomplex *zero = PASTEMAC(z, 0); + if(cntx == NULL) cntx = bli_gks_query_cntx(); + /* Query the context for the kernel function pointer. */ const num_t dt = PASTEMAC(z, type);