Merge commit '4670df5ca606e6e3ee07a085ea61016489bf91ad' into develop

2026-07-19 02:01:01 +00:00 · 2026-01-03 01:41:33 +00:00
parent e64da4f3d6
commit 0b05cd0351
1 changed file with 6 additions and 8 deletions
--- a/include/ck/library/utility/device_tensor_generator.hpp
+++ b/include/ck/library/utility/device_tensor_generator.hpp
@@ -7,7 +7,6 @@
 #include "ck/utility/common_header.hpp"
 #include "ck/library/utility/device_tensor_generator.hpp"
 #include "ck/utility/data_type.hpp"
-#include <cmath>

 // use xorshift for now since it is simple. Should be suitable enough, but feel free to switch in
 // the future
@@ -107,7 +106,7 @@ template <typename T>
 __global__ void
 fill_tensor_norm_rand_fp_values(T* p, float sigma, float mean, uint64_t buffer_element_size)
 {
-    static constexpr PI = std::acos(-1.0);
+    static constexpr float PI = 3.141592653f;
    // initial values
    ran_state_u32 s = ran_init();
    float norm[2];
@@ -116,12 +115,11 @@ fill_tensor_norm_rand_fp_values(T* p, float sigma, float mean, uint64_t buffer_e
    {
        if(j % (2 / ck::packed_size_v<T>) == 0)
        {
-            float u1 = ran_gen_round_u32(s) * (1.0f / 4294967296.0f);
-            float u2 = ran_gen_round_u32(s) * (1.0f / 4294967296.0f);
-            norm[0] =
-                sigma * std::sqrt(-2.0f * ck::math::log(u1)) * std::cos(2.0f * PI * u2) + mean;
-            norm[1] =
-                sigma * std::sqrt(-2.0f * ck::math::log(u1)) * std::sin(2.0f * PI * u2) + mean;
+            float u1    = ran_gen_round_u32(s) * (1.0f / 4294967296.0f);
+            float u2    = ran_gen_round_u32(s) * (1.0f / 4294967296.0f);
+            float scale = sigma * ck::math::sqrt(-2.0f * ck::math::log(u1));
+            norm[0]     = scale * ck::math::cos(2.0f * PI * u2) + mean;
+            norm[1]     = scale * ck::math::sin(2.0f * PI * u2) + mean;
        }

        if constexpr(ck::is_same_v<T, ck::f4x2_pk_t>)