diff --git a/include/ck/library/utility/device_tensor_generator.hpp b/include/ck/library/utility/device_tensor_generator.hpp index 80a91adda4..60bc3110d4 100644 --- a/include/ck/library/utility/device_tensor_generator.hpp +++ b/include/ck/library/utility/device_tensor_generator.hpp @@ -7,7 +7,6 @@ #include "ck/utility/common_header.hpp" #include "ck/library/utility/device_tensor_generator.hpp" #include "ck/utility/data_type.hpp" -#include // use xorshift for now since it is simple. Should be suitable enough, but feel free to switch in // the future @@ -107,7 +106,7 @@ template __global__ void fill_tensor_norm_rand_fp_values(T* p, float sigma, float mean, uint64_t buffer_element_size) { - static constexpr PI = std::acos(-1.0); + static constexpr float PI = 3.141592653f; // initial values ran_state_u32 s = ran_init(); float norm[2]; @@ -116,12 +115,11 @@ fill_tensor_norm_rand_fp_values(T* p, float sigma, float mean, uint64_t buffer_e { if(j % (2 / ck::packed_size_v) == 0) { - float u1 = ran_gen_round_u32(s) * (1.0f / 4294967296.0f); - float u2 = ran_gen_round_u32(s) * (1.0f / 4294967296.0f); - norm[0] = - sigma * std::sqrt(-2.0f * ck::math::log(u1)) * std::cos(2.0f * PI * u2) + mean; - norm[1] = - sigma * std::sqrt(-2.0f * ck::math::log(u1)) * std::sin(2.0f * PI * u2) + mean; + float u1 = ran_gen_round_u32(s) * (1.0f / 4294967296.0f); + float u2 = ran_gen_round_u32(s) * (1.0f / 4294967296.0f); + float scale = sigma * ck::math::sqrt(-2.0f * ck::math::log(u1)); + norm[0] = scale * ck::math::cos(2.0f * PI * u2) + mean; + norm[1] = scale * ck::math::sin(2.0f * PI * u2) + mean; } if constexpr(ck::is_same_v)