AVX2 dgemm kernel optimization for AOCC

Details: k0 is always positive in bli_dgemm_haswell_asm_6x8(), so the operations involving
     k0 are cast to uint64_t to enable AOCC to generate optimized code.
     Thanks to Jini Susan (jinisusan.george@amd.com) from the compiler team for suggesting
     this change. A similar change was applied to the sgemm, cgemm and zgemm kernels.
Change-Id: I423c949e0c1835652142a6931dadf4a7d190aeb9
This commit is contained in:
Kiran Varaganti
2022-11-14 07:14:24 +00:00
parent 9e8595356f
commit d8d4499e54

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc.All rights reserved.
Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc.All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -95,8 +95,8 @@ void bli_sgemm_haswell_asm_6x16
// Typecast local copies of integers in case dim_t and inc_t are a
// different size than is expected by load instructions.
uint64_t k_iter = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t k_iter = (uint64_t)k0 / 4;
uint64_t k_left = (uint64_t)k0 % 4;
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
@@ -957,8 +957,8 @@ void bli_dgemm_haswell_asm_6x8
// Typecast local copies of integers in case dim_t and inc_t are a
// different size than is expected by load instructions.
uint64_t k_iter = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t k_iter = (uint64_t)k0/4;
uint64_t k_left = (uint64_t)k0%4;
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
@@ -1720,8 +1720,8 @@ void bli_cgemm_haswell_asm_3x8
// Typecast local copies of integers in case dim_t and inc_t are a
// different size than is expected by load instructions.
uint64_t k_iter = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t k_iter = (uint64_t)k0 / 4;
uint64_t k_left = (uint64_t)k0 % 4;
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
@@ -2249,8 +2249,8 @@ void bli_zgemm_haswell_asm_3x4
// Typecast local copies of integers in case dim_t and inc_t are a
// different size than is expected by load instructions.
uint64_t k_iter = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t k_iter = (uint64_t)k0 / 4;
uint64_t k_left = (uint64_t)k0 % 4;
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;