mirror of
https://github.com/amd/blis.git
synced 2026-05-11 01:30:00 +00:00
Implemented optimal AVX512-variant of f32 LPGEMV
1. The 5 LOOP LPGEMM path is inefficient when A or B is a vector (i.e., m == 1 or n == 1). 2. An efficient implementation of lpgemv_rowvar_f32 is developed, taking into account the B matrix reorder in the m = 1 case and post-ops fusion. 3. When m = 1, the algorithm divides the GEMM workload along the n dimension at a granularity of NR. Each thread works on A:1xk B:kx(>=NR) and produces C = 1x(>=NR). K is unrolled by 4, along with a remainder loop. 4. When n = 1, the algorithm divides the GEMM workload along the m dimension at a granularity of MR. Each thread works on A:(>=MR)xk B:kx1 and produces C = (>=MR)x1. When n = 1, reordering of B is avoided so that it can be processed efficiently in the n = 1 kernel. 5. Fixed a few warnings raised when loading 2 f32 bias elements with _mm_load_sd through a float pointer, by typecasting the pointer to (const double *). AMD-Internal: [SWLCSG-2391, SWLCSG-2353] Change-Id: If1d0b8d59e0278f5f16b499de1d629e63da5b599
This commit is contained in:
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -71,4 +71,33 @@ LPGEMM_5LOOP(float,float,float,f32f32f32of32);
|
||||
LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32);
|
||||
LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32);
|
||||
LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16);
|
||||
#endif // LPGEMM_5LOOP_INTF_H
|
||||
|
||||
#define LPGEMV(A_type, B_type, C_type, LP_SFX) \
|
||||
void lpgemv_rowvar_ ## LP_SFX \
|
||||
( \
|
||||
const dim_t m, \
|
||||
const dim_t n, \
|
||||
const dim_t k, \
|
||||
const A_type *a, \
|
||||
const dim_t rs_a, \
|
||||
const dim_t cs_a, \
|
||||
const AOCL_MEMORY_TAG mtag_a, \
|
||||
const B_type *b, \
|
||||
const dim_t rs_b, \
|
||||
const dim_t cs_b, \
|
||||
const AOCL_MEMORY_TAG mtag_b, \
|
||||
C_type *c, \
|
||||
const dim_t rs_c, \
|
||||
const dim_t cs_c, \
|
||||
const C_type alpha, \
|
||||
const C_type beta, \
|
||||
rntm_t *rntm, \
|
||||
lpgemm_thrinfo_t *thread, \
|
||||
lpgemm_cntx_t *lcntx, \
|
||||
lpgemm_post_op *post_op_list, \
|
||||
AOCL_STORAGE_TYPE c_downscale \
|
||||
) \
|
||||
|
||||
LPGEMV(float, float, float, f32f32f32of32);
|
||||
|
||||
#endif // LPGEMM_5LOOP_INTF_H
|
||||
Reference in New Issue
Block a user