diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp
index f329a8a4d3..5538716cff 100644
--- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp
+++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp
@@ -238,6 +238,8 @@ UniqueInputs<SIGNATURE> alloc_inputs(const Args<SIGNATURE>& args)
     };
 }
 
+
+
 /// @brief `alloc_outputs()` specialization for forward convolution.
 ///
 /// @tparam SIGNATURE Forward convolution signature.
diff --git a/experimental/builder/include/ck_tile/builder/testing/testing.hpp b/experimental/builder/include/ck_tile/builder/testing/testing.hpp
index 1873af2882..01d9c698c9 100644
--- a/experimental/builder/include/ck_tile/builder/testing/testing.hpp
+++ b/experimental/builder/include/ck_tile/builder/testing/testing.hpp
@@ -205,6 +205,19 @@ template <auto SIGNATURE>
     requires ValidUniqueInputs<SIGNATURE>
 UniqueInputs<SIGNATURE> alloc_inputs(const Args<SIGNATURE>& args);
 
+/// @brief Allocate inputs corresponding to a signature.
+///
+/// The `init_inputs()` function is used to initialize pseudo-random data
+/// to the tensors specified in the Inputs structure.
+///
+/// @tparam SIGNATURE the signature to specialize the structure for.
+///
+/// @see Inputs
+/// @see UniqueInputs
+template <auto SIGNATURE>
+    requires ValidUniqueInputs<SIGNATURE>
+UniqueInputs<SIGNATURE> init_inputs(const Args<SIGNATURE>& args);
+
 /// @brief Allocate outputs corresponding to a signature.
 ///
 /// The `alloc_outputs()` function is used to create an instance of
diff --git a/include/ck/library/utility/device_tensor_generator.hpp b/include/ck/library/utility/device_tensor_generator.hpp
new file mode 100644
index 0000000000..4da38bf399
--- /dev/null
+++ b/include/ck/library/utility/device_tensor_generator.hpp
@@ -0,0 +1,135 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+#pragma once
+#include <hip/hip_runtime.h>
+
+#include "ck/ck.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/library/utility/device_tensor_generator.hpp"
+#include "ck/utility/data_type.hpp"
+#include <cmath>
+
+// use xorshift for now since it is simple. Should be suitable enough, but feel free to switch in
+// the future
+struct ran_state_u32
+{
+    uint32_t s[4];
+};
+
+__device__ uint32_t ran_gen_round_u32(ran_state_u32& state)
+{
+    uint32_t tmp = state.s[3];
+    state.s[3]   = state.s[2];
+    state.s[2]   = state.s[1];
+    state.s[1]   = state.s[0];
+    tmp ^= tmp << 11;
+    tmp ^= tmp >> 8;
+    state.s[0] = tmp ^ state.s[0] ^ (state.s[0] >> 19);
+    return state.s[0];
+}
+
+__device__ ran_state_u32 ran_init(uint32_t seed = 0)
+{
+    ran_state_u32 state;
+    // use primes for initialization
+    state.s[0] = (blockDim.x * blockIdx.x + threadIdx.x) * 8912741 + 2313212 + seed;
+    state.s[1] =
+        (gridDim.x * blockDim.x - (blockDim.x * blockIdx.x + threadIdx.x)) * 5013829 + 6012697;
+    state.s[2] = (blockDim.x * blockIdx.x + threadIdx.x) * 3412309 + 2912479;
+    state.s[3] =
+        (gridDim.x * blockDim.x - (blockDim.x * blockIdx.x + threadIdx.x)) * 1001447 + 9912307;
+
+    // run 20 rounds
+    for(int i = 0; i < 20; i++)
+    {
+        ran_gen_round_u32(state);
+    }
+    return state;
+}
+
+template <typename T>
+__global__ void fill_tensor_uniform_rand_int_values(T* p,
+                                                    int min_value,
+                                                    int max_value,
+                                                    uint64_t buffer_element_size)
+{
+    // initial values
+    ran_state_u32 s = ran_init();
+    for(uint64_t i = blockIdx.x * blockDim.x + threadIdx.x;
+        i < buffer_element_size / ck::packed_size_v<T>;
+        i += blockDim.x * gridDim.x)
+    {
+        if constexpr(ck::is_same_v<T, ck::pk_i4_t>)
+        {
+            uint8_t hi      = ((ran_gen_round_u32(s)) % (max_value - min_value)) + min_value + 8;
+            uint8_t lo      = ((ran_gen_round_u32(s)) % (max_value - min_value)) + min_value + 8;
+            ck::pk_i4_t res = ((hi & 0xf) << 4) + (lo & 0xf);
+            p[i]            = res;
+        }
+        else
+        {
+            p[i] = ck::type_convert<T, int>(
+                static_cast<int>((ran_gen_round_u32(s)) % (max_value - min_value)) + min_value);
+        }
+    }
+}
+
+template <typename T>
+__global__ void fill_tensor_uniform_rand_fp_values(T* p,
+                                                   float min_value,
+                                                   float max_value,
+                                                   uint64_t buffer_element_size)
+{
+    // initial values
+    ran_state_u32 s = ran_init();
+    for(uint64_t i = blockIdx.x * blockDim.x + threadIdx.x;
+        i < buffer_element_size / ck::packed_size_v<T>;
+        i += blockDim.x * gridDim.x)
+    {
+        if constexpr(ck::is_same_v<T, ck::f4x2_pk_t>)
+        {
+            float u1 =
+                ran_gen_round_u32(s) * (1.0f / 4294967296.0f) * (max_value - min_value) + min_value;
+            float u2 =
+                ran_gen_round_u32(s) * (1.0f / 4294967296.0f) * (max_value - min_value) + min_value;
+
+            p[i] = ck::type_convert<ck::f4x2_t>(ck::float2_t{u1, u2});
+        }
+        else
+        {
+            float ran = ran_gen_round_u32(s) * (1.0f / 4294967296.0f);
+            p[i]      = ck::type_convert<T, float>(ran * (max_value - min_value) + min_value);
+        }
+    }
+}
+
+template <typename T>
+__global__ void
+fill_tensor_norm_rand_fp_values(T* p, float sigma, float mean, uint64_t buffer_element_size)
+{
+    // initial values
+    ran_state_u32 s = ran_init();
+    float norm[2];
+    for(uint64_t i = blockIdx.x * blockDim.x + threadIdx.x, j = 0; i < buffer_element_size;
+        i += blockDim.x * gridDim.x, j++)
+    {
+        if(j % (2 / ck::packed_size_v<T>) == 0)
+        {
+            float u1 = ran_gen_round_u32(s) * (1.0f / 4294967296.0f);
+            float u2 = ran_gen_round_u32(s) * (1.0f / 4294967296.0f);
+            norm[0] =
+                sigma * std::sqrt(-2.0f * ck::math::log(u1)) * std::cos(2.0f * M_PI * u2) + mean;
+            norm[1] =
+                sigma * std::sqrt(-2.0f * ck::math::log(u1)) * std::sin(2.0f * M_PI * u2) + mean;
+        }
+
+        if constexpr(ck::is_same_v<T, ck::f4x2_pk_t>)
+        {
+            p[i] = ck::type_convert<ck::f4x2_t>(ck::float2_t{norm[0], norm[1]});
+        }
+        else
+        {
+            p[i] = ck::type_convert<T, float>(norm[j % 2]);
+        }
+    }
+}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 802f29024c..87f2ec5d15 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -296,6 +296,7 @@ add_subdirectory(transpose)
 add_subdirectory(permute_scale)
 add_subdirectory(wrapper)
 add_subdirectory(quantization)
+add_subdirectory(device_memory)
 if(SUPPORTED_GPU_TARGETS MATCHES "gfx11")
     add_subdirectory(wmma_op)
 endif()
diff --git a/test/device_memory/CMakeLists.txt b/test/device_memory/CMakeLists.txt
new file mode 100644
index 0000000000..83b851cc0b
--- /dev/null
+++ b/test/device_memory/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_custom_target(device_mem_tests)
+add_gtest_executable(test_device_prng test_device_prng.cpp)
+target_link_libraries(test_device_prng PRIVATE utility)
+add_dependencies(test_device_prng device_mem_tests)
\ No newline at end of file
diff --git a/test/device_memory/test_device_prng.cpp b/test/device_memory/test_device_prng.cpp
new file mode 100644
index 0000000000..bbc5377af1
--- /dev/null
+++ b/test/device_memory/test_device_prng.cpp
@@ -0,0 +1,225 @@
+#include <cmath>
+#include <vector>
+#include <gtest/gtest.h>
+#include <random>
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/ck.hpp"
+
+template <typename inType, typename outType>
+void convertTypeFromDevice(std::vector<inType>& fromDevice,
+                           std::vector<outType>& res,
+                           uint64_t num_elements)
+{
+    for(uint64_t i = 0; i < num_elements / ck::packed_size_v<inType>; i++)
+    {
+        // since the CPU dosen't have non-standard data types, we need to convert to float
+        if constexpr(ck::is_same_v<ck::remove_cvref_t<inType>, ck::f4x2_pk_t>)
+        {
+            ck::float2_t tmp = ck::type_convert<ck::float2_t, ck::f4x2_t>(fromDevice[i]);
+            res[i * 2]       = tmp.x;
+            res[i * 2 + 1]   = tmp.y;
+        }
+        else if constexpr(ck::is_same_v<ck::remove_cvref_t<inType>, ck::pk_i4_t>)
+        {
+            uint8_t packed = fromDevice[i].data;
+
+            int hi         = (packed >> 4) & 0x0f;
+            int lo         = packed & 0x0f;
+            res[i * 2]     = static_cast<outType>(hi - 8);
+            res[i * 2 + 1] = static_cast<outType>(lo - 8);
+        }
+        else
+        {
+            res[i] = ck::type_convert<outType, inType>(fromDevice[i]);
+        }
+    }
+}
+
+template <typename T>
+void TDevRanUniGenInt(int min_val, int max_val, uint64_t num_elements)
+{
+
+    size_t packed_size = ck::packed_size_v<T>;
+
+    ck::DeviceMem test_buf(sizeof(T) * num_elements / packed_size);
+    std::vector<T> from_host(num_elements / packed_size);
+    std::vector<int> host_elements(num_elements);
+
+    test_buf.FillUniformRandInteger<T>(min_val, max_val);
+    test_buf.FromDevice(&from_host[0]);
+
+    uint64_t num_equal = 0;
+    bool in_range      = true;
+
+    convertTypeFromDevice(from_host, host_elements, num_elements);
+    // very basic checks: check if all data points are in range and
+    // hf data is within 6 sigma of expected value
+    for(uint64_t i = 0; i < num_elements; i++)
+    {
+        if(host_elements[i] >= max_val || host_elements[i] < min_val)
+        {
+            in_range = false;
+        }
+        if(i > 0)
+        {
+            if(host_elements[i] == host_elements[i - 1])
+            {
+                num_equal++;
+            }
+        }
+    }
+    EXPECT_TRUE(in_range);
+
+    double expected_mean =
+        (static_cast<double>(num_elements) - 1.0) / (static_cast<double>(max_val - min_val));
+    double std_dev     = std::sqrt(expected_mean);
+    double upper_bound = expected_mean + 6 * std_dev;
+    double lower_bound = expected_mean - 6 * std_dev;
+
+    // in these cases the test parameters are unsuitable
+    EXPECT_TRUE(lower_bound > 1.0);
+    EXPECT_TRUE(upper_bound < static_cast<double>(num_elements) - 2.0);
+
+    // printf("lower bound: %f upper bound: %f actual: %d\n",
+    //        lower_bound,
+    //        upper_bound,
+    //        static_cast<int>(num_equal));
+    EXPECT_TRUE(static_cast<double>(num_equal) > lower_bound);
+    EXPECT_TRUE(static_cast<double>(num_equal) < upper_bound);
+}
+
+template <typename T>
+void TDevRanUniGenFp(double min_val,
+                     double max_val,
+                     uint64_t num_elements,
+                     double std_err_tolerance = 6.0)
+{
+    size_t packed_size = ck::packed_size_v<T>;
+    ck::DeviceMem test_buf(sizeof(T) * num_elements / packed_size);
+    std::vector<T> host_buf(num_elements / packed_size);
+    std::vector<float> host_elements(num_elements);
+
+    test_buf.FillUniformRandFp<T>(min_val, max_val);
+    test_buf.FromDevice(&host_buf[0]);
+
+    bool in_range         = true;
+    double accum_mean     = 0.0;
+    double accum_variance = 0.0;
+
+    // #kabraham: with floats, we can actually do some more extensive tests,
+    //  compute mean, std_dev and std_err and compare these to expected values
+    convertTypeFromDevice(host_buf, host_elements, num_elements);
+    for(uint64_t i = 0; i < num_elements; i++)
+    {
+        if(host_elements[i] > max_val || host_elements[i] < min_val)
+        {
+            in_range = false;
+        }
+        accum_mean += host_elements[i];
+    }
+    EXPECT_TRUE(in_range);
+    EXPECT_TRUE(accum_mean != 0.0);
+    double mean = accum_mean / num_elements;
+
+    for(uint64_t i = 0; i < num_elements; i++)
+    {
+        accum_variance += std::pow(host_elements[i] - mean, 2);
+    }
+    double std_dev = std::sqrt(accum_variance) / num_elements;
+
+    double expected_mean    = (min_val + max_val) / 2.0;
+    double expected_std_dev = (max_val - min_val) / std::sqrt(12 * num_elements);
+    double std_err          = expected_std_dev / sqrt(num_elements);
+    //    printf(
+    //        "Expected: mean: %f std_dev: %f std_err : %f\n", expected_mean, expected_std_dev,
+    //        std_err);
+    //    printf("  Actual: mean: %f std_dev: %f \n", mean, std_dev);
+    EXPECT_TRUE(abs(mean - expected_mean) < 6 * expected_std_dev);
+    EXPECT_TRUE(abs(std_dev - expected_std_dev) < std_err_tolerance * std_err);
+}
+
+template <typename T>
+void TDevRanNormGenFp(double sigma,
+                      double mean,
+                      uint64_t num_elements,
+                      double ERRF_BUCKET_SIZE  = 0.1,
+                      double ERRF_BUCKET_RANGE = 3.0,
+                      double sig_tolerence     = 6.0)
+{
+    ck::DeviceMem test_buf(sizeof(T) * num_elements);
+    std::vector<T> host_buf(num_elements);
+    std::vector<float> host_elements(num_elements);
+
+    test_buf.FillNormalRandFp<T>(sigma, mean);
+    test_buf.FromDevice(&host_buf[0]);
+
+    convertTypeFromDevice(host_buf, host_elements, num_elements);
+
+    // #kabraham: compute errf buckets and compare with expected vaules
+    int ERRF_NUM_BUCKETS = 2 * ERRF_BUCKET_RANGE / ERRF_BUCKET_SIZE + 1;
+
+    std::vector<int64_t> errf_buckets(ERRF_NUM_BUCKETS, 0);
+    for(uint64_t i = 0; i < num_elements; i++)
+    {
+        for(int bucket = 0; bucket < ERRF_NUM_BUCKETS; bucket++)
+        {
+            // #kabraham: count exact hits as half (kind of relevant for utra-low-precision formats)
+            if(host_elements[i] < sigma * (-ERRF_BUCKET_RANGE + bucket * ERRF_BUCKET_SIZE) + mean)
+            {
+                errf_buckets[bucket] += 2;
+            }
+            else if(host_elements[i] <=
+                    sigma * (-ERRF_BUCKET_RANGE + bucket * ERRF_BUCKET_SIZE) + mean)
+            {
+                errf_buckets[bucket] += 1;
+            }
+        }
+    }
+
+    for(int bucket = 0; bucket < ERRF_NUM_BUCKETS; bucket++)
+    {
+        double expected_num_entries =
+            (std::erfc((ERRF_BUCKET_RANGE - bucket * ERRF_BUCKET_SIZE) / std::sqrt(2))) * 0.5 *
+            num_elements;
+        double noise_range = std::sqrt(expected_num_entries);
+        //     printf("Expected for bucket %d: %d. Actual: %d \n",
+        //            bucket,
+        //            static_cast<int>(expected_num_entries),
+        //            static_cast<int>(errf_buckets[bucket] / 2));
+        EXPECT_TRUE(errf_buckets[bucket] / 2 >= expected_num_entries - sig_tolerence * noise_range);
+        EXPECT_TRUE(errf_buckets[bucket] / 2 <= expected_num_entries + sig_tolerence * noise_range);
+    }
+}
+
+TEST(TDevIntegerRanUniGen, U8) { TDevRanUniGenInt<uint8_t>(0, 2, 15000); }
+TEST(TDevIntegerRanUniGen, U16) { TDevRanUniGenInt<uint16_t>(0, 100, 100000); }
+TEST(TDevIntegerRanUniGen, U32) { TDevRanUniGenInt<uint32_t>(0, 10000, 10000000); }
+TEST(TDevIntegerRanUniGen, I4) { TDevRanUniGenInt<ck::pk_i4_t>(-2, 2, 10000000); }
+
+TEST(TDevIntegerRanUniGen, F32) { TDevRanUniGenInt<float>(-2, 2, 10000000); }
+TEST(TDevIntegerRanUniGen, F16) { TDevRanUniGenInt<ck::half_t>(-2, 2, 1000000); }
+TEST(TDevIntegerRanUniGen, BF16) { TDevRanUniGenInt<ck::bhalf_t>(-2, 2, 1000000); }
+
+TEST(TDevFpRanUniGen, F32_1) { TDevRanUniGenFp<float>(0, 1, 100000); }
+TEST(TDevFpRanUniGen, F32_2) { TDevRanUniGenFp<float>(0, 37, 73000); }
+TEST(TDevFpRanUniGen, F32_3) { TDevRanUniGenFp<float>(-2, 1, 84000); }
+
+TEST(TDevFpRanUniGen, F16) { TDevRanUniGenFp<ck::half_t>(-1, 1, 100000); }
+TEST(TDevFpRanUniGen, BF16) { TDevRanUniGenFp<ck::bhalf_t>(0, 2, 100000); }
+TEST(TDevFpRanUniGen, F8) { TDevRanUniGenFp<ck::f8_t>(0, 2, 100000); }
+TEST(TDevFpRanUniGen, BF8) { TDevRanUniGenFp<ck::bf8_t>(-5, 5, 100000); }
+TEST(TDevFpRanUniGen, F4) { TDevRanUniGenFp<ck::f4x2_pk_t>(-5, 5, 100000, 20.0); }
+
+TEST(TDevRanNormGenFp, F32_1) { TDevRanNormGenFp<float>(1, 0, 1000000); }
+TEST(TDevRanNormGenFp, F32_2) { TDevRanNormGenFp<float>(5, -2, 10000000, 0.2, 5.0); }
+
+TEST(TDevRanNormGenFp, F16) { TDevRanNormGenFp<ck::half_t>(5, -2, 100000); }
+TEST(TDevRanNormGenFp, BF16) { TDevRanNormGenFp<ck::bhalf_t>(5, -2, 100000); }
+
+TEST(TDevRanNormGenFp, F8) { TDevRanNormGenFp<ck::f8_t>(2, 0, 100000, 0.5, 2.0, 10.0); }
+TEST(TDevRanNormGenFp, BF8) { TDevRanNormGenFp<ck::bf8_t>(16, 0, 100000, 0.5, 2.0, 30.0); }
+
+TEST(TDevRanNormGenFp, F4) { TDevRanNormGenFp<ck::f4x2_pk_t>(2, 0, 100000, 0.5, 3.0, 30.0); }