From 2fe4c8acec12236d16e738da4ab217cd077d6420 Mon Sep 17 00:00:00 2001 From: Johannes Graner Date: Mon, 15 Dec 2025 13:38:25 +0100 Subject: [PATCH] Add grouped convnd dataset tests for bwd_data, bwd_weight and make them parallel (#3380) * Parallelization in dataset generation * Parallelizable tests for fwd, bwd data, bwd weight with datasets * .gitignore generated datasets * Test parallelization script with round-robin GPU scheduling * Parallelization updates to test generation and running * Dataset paths relative to executable * Update output from test generation * Default to one GPU in test generation * Add small dataset tests to Jenkins * Update copyright lines * Update test_data/generate_test_dataset.sh Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Move trap disable * Common get path function --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> [ROCm/composable_kernel commit: fe35ba5dac168619462669192423ff40548d532d] --- .gitignore | 5 + Jenkinsfile | 10 +- test/common/csv_test_loader.hpp | 246 ++++ test/grouped_convnd_bwd_data/CMakeLists.txt | 4 + ...st_grouped_convnd_bwd_data_dataset_xdl.cpp | 317 +++++ test/grouped_convnd_bwd_weight/CMakeLists.txt | 4 + ..._grouped_convnd_bwd_weight_dataset_xdl.cpp | 258 ++++ .../test_grouped_convnd_fwd_dataset_xdl.cpp | 451 ++----- test_data/generate_test_dataset.sh | 198 ++- test_data/gtest_parallel.py | 1187 +++++++++++++++++ 10 files changed, 2349 insertions(+), 331 deletions(-) create mode 100644 test/common/csv_test_loader.hpp create mode 100644 test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_dataset_xdl.cpp create mode 100644 test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_dataset_xdl.cpp create mode 100644 test_data/gtest_parallel.py diff --git a/.gitignore b/.gitignore index d8468cf24e..98234268c1 100644 --- a/.gitignore +++ b/.gitignore @@ -83,6 +83,11 @@ __pycache__/ .cache/ +# Generated test data +test_data/* +!test_data/*.py +!test_data/*.sh + # Exceptions to build* patterns above # The experimental/builder directory should be tracked despite matching build* !experimental/builder diff --git a/Jenkinsfile b/Jenkinsfile index 5f03310cab..aea14c78b6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1476,15 +1476,19 @@ pipeline { setup_args = "NO_CK_BUILD" execute_args = """ cd ../build && \ ../script/cmake-ck-dev.sh ../ gfx90a && \ - make -j64 test_grouped_convnd_fwd_dataset_xdl && \ + make -j64 test_grouped_convnd_fwd_dataset_xdl \ + test_grouped_convnd_bwd_data_dataset_xdl \ + test_grouped_convnd_bwd_weight_dataset_xdl && \ cd ../test_data && \ # Dataset generation modes: # - small: ~60 test cases (minimal, quick testing - 3 models, 2 batch sizes, 2 image sizes) # - half: ~300 test cases (moderate coverage - 16 models, 3 batch sizes, 5 image sizes), ~ 17 hours testing time # - full: ~600 test cases (comprehensive - 16 models, 5 batch sizes, 9 image sizes), ~ 40 hours testing time - ./generate_test_dataset.sh half && \ + ./generate_test_dataset.sh small && \ cd ../build && \ - ./bin/test_grouped_convnd_fwd_dataset_xdl""" + ./bin/test_grouped_convnd_fwd_dataset_xdl && \ + ./bin/test_grouped_convnd_bwd_data_dataset_xdl && \ + ./bin/test_grouped_convnd_bwd_weight_dataset_xdl""" } steps{ buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) diff --git a/test/common/csv_test_loader.hpp b/test/common/csv_test_loader.hpp new file mode 100644 index 0000000000..78d3595f1a --- /dev/null +++ b/test/common/csv_test_loader.hpp @@ -0,0 
+1,246 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "ck/library/utility/convolution_parameter.hpp"
+
+namespace ck {
+namespace test {
+
+namespace fs = std::filesystem;
+
+// Helper function to find test_data directory relative to the test binary
+static std::string GetTestDataPath()
+{
+    // Get the path to the current executable
+    fs::path exe_path = fs::read_symlink("/proc/self/exe");
+
+    // Get the directory containing the executable
+    fs::path current_dir = exe_path.parent_path();
+
+    // Search for test_data directory by going up the directory tree
+    // This makes the code robust regardless of build directory depth
+    while(current_dir != current_dir.root_path())
+    {
+        fs::path test_data_path = current_dir / "test_data";
+        if(fs::exists(test_data_path) && fs::is_directory(test_data_path))
+        {
+            return test_data_path.string();
+        }
+        current_dir = current_dir.parent_path();
+    }
+
+    // If not found, return empty string
+    std::cerr << "ERROR: Could not find test_data directory relative to executable" << std::endl;
+    return "";
+}
+
+// CSV Reader Function for Loading Test Cases
+// Reads convolution parameters from CSV file and returns vector of ConvParam structures
+inline std::vector<ck::utils::conv::ConvParam> load_csv_test_cases(const std::string& filename)
+{
+    std::vector<ck::utils::conv::ConvParam> conv_params; // Return vector
+    std::ifstream file(filename);                        // Open CSV file
+
+    if(!file.is_open())
+    {
+        std::cerr << "ERROR: Cannot open CSV file: " << filename << std::endl;
+        return conv_params; // Return empty vector on error
+    }
+
+    std::string line;
+    int line_number = 0;
+
+    // Read file line by line
+    while(std::getline(file, line))
+    {
+        line_number++;
+        // Skip comment lines (starting with #) and empty lines
+        if(line.empty() || line[0] == '#')
+        {
+            continue;
+        }
+
+        // Skip header line (contains column names)
+        if(line.find("NDim,Groups,BatchSize") != std::string::npos)
+        {
+            continue;
+        }
+
+        // Parse CSV line using stringstream
+        std::stringstream ss(line);
+        std::string cell;
+        std::vector<std::string> row;
+
+        // Split line by commas
+        while(std::getline(ss, cell, ','))
+        {
+            row.push_back(cell);
+        }
+
+        // Validate row has correct number of columns
+        if(row.size() < 19)
+        { // Need at least 19 columns for 2D (excluding TestName)
+            std::cerr << "WARNING: Line " << line_number << " has insufficient columns ("
+                      << row.size() << "), skipping" << std::endl;
+            continue;
+        }
+
+        try
+        {
+            // Parse CSV data into ConvParam structure
+            // CSV Format:
+            // NDim,Groups,BatchSize,OutChannels,InChannels,KernelH,KernelW,InputH,InputW,OutputH,OutputW,StrideH,StrideW,DilationH,DilationW,LeftPadH,LeftPadW,RightPadH,RightPadW,TestName
+            int NDim        = std::stoi(row[0]);
+            int Groups      = std::stoi(row[1]);
+            int BatchSize   = std::stoi(row[2]);
+            int OutChannels = std::stoi(row[3]);
+            int InChannels  = std::stoi(row[4]);
+
+            if(NDim == 1)
+            {
+                // 1D Convolution: Need fewer columns for 1D parameters
+                if(row.size() < 13)
+                {
+                    std::cerr << "WARNING: 1D convolution on line " << line_number
+                              << " needs 13+ columns, has " << row.size() << ", skipping"
+                              << std::endl;
+                    continue;
+                }
+                // 1D Convolution: {NDim, Groups, BatchSize, OutChannels, InChannels,
+                // {KernelW}, {InputW}, {StrideW}, {DilationW}, {LeftPadW}, {RightPadW}}
+                ck::utils::conv::ConvParam param = {
+                    NDim,        // NDim = 1
+                    Groups,      // Groups
+                    BatchSize,   // Batch size
+                    OutChannels, // Output channels
+                    InChannels,  // Input channels
+                    {std::stoi(row[5])},
// Kernel: {W} + {std::stoi(row[7])}, // Input: {W} + {std::stoi(row[11])}, // Stride: {W} + {std::stoi(row[13])}, // Dilation: {W} + {std::stoi(row[15])}, // Left pad: {W} + {std::stoi(row[17])} // Right pad: {W} + }; + conv_params.push_back(param); + } + else if(NDim == 2) + { + // 2D Convolution: {NDim, Groups, BatchSize, OutChannels, InChannels, + // {KernelH,KernelW}, {InputH,InputW}, {StrideH,StrideW}, {DilationH,DilationW}, + // {LeftPadH,LeftPadW}, {RightPadH,RightPadW}} + ck::utils::conv::ConvParam param = { + NDim, // NDim = 2 + Groups, // Groups + BatchSize, // Batch size + OutChannels, // Output channels + InChannels, // Input channels + {std::stoi(row[5]), std::stoi(row[6])}, // Kernel: {H, W} + {std::stoi(row[7]), std::stoi(row[8])}, // Input: {H, W} + {std::stoi(row[11]), std::stoi(row[12])}, // Stride: {H, W} + {std::stoi(row[13]), std::stoi(row[14])}, // Dilation: {H, W} + {std::stoi(row[15]), std::stoi(row[16])}, // Left pad: {H, W} + {std::stoi(row[17]), std::stoi(row[18])} // Right pad: {H, W} + }; + conv_params.push_back(param); + } + else if(NDim == 3) + { + // 3D Convolution: Need more columns for 3D parameters + if(row.size() < 26) + { + std::cerr << "WARNING: 3D convolution on line " << line_number + << " needs 26+ columns, has " << row.size() << ", skipping" + << std::endl; + continue; + } + // 3D Convolution: {NDim, Groups, BatchSize, OutChannels, InChannels, + // {KernelD,KernelH,KernelW}, {InputD,InputH,InputW}, {OutputD,OutputH,OutputW}, + // {StrideD,StrideH,StrideW}, {DilationD,DilationH,DilationW}, + // {LeftPadD,LeftPadH,LeftPadW}, {RightPadD,RightPadH,RightPadW}} + ck::utils::conv::ConvParam param = { + NDim, // NDim = 3 + Groups, // Groups + BatchSize, // Batch size + OutChannels, // Output channels + InChannels, // Input channels + {std::stoi(row[5]), std::stoi(row[6]), std::stoi(row[7])}, // Kernel: {D, H, W} + {std::stoi(row[8]), std::stoi(row[9]), std::stoi(row[10])}, // Input: {D, H, W} + {std::stoi(row[14]), + std::stoi(row[15]), + std::stoi(row[16])}, // Stride: {D, H, W} + {std::stoi(row[17]), + std::stoi(row[18]), + std::stoi(row[19])}, // Dilation: {D, H, W} + {std::stoi(row[20]), + std::stoi(row[21]), + std::stoi(row[22])}, // Left pad: {D, H, W} + {std::stoi(row[23]), + std::stoi(row[24]), + std::stoi(row[25])} // Right pad: {D, H, W} + }; + conv_params.push_back(param); + } + else + { + std::cerr << "WARNING: Unsupported NDim=" << NDim << " on line " << line_number + << ", skipping" << std::endl; + } + } + catch(const std::exception& e) + { + std::cerr << "ERROR: Failed to parse line " << line_number << ": " << e.what() + << std::endl; + continue; + } + } + + file.close(); + std::cout << "Loaded " << conv_params.size() << " test cases from " << filename << std::endl; + return conv_params; +} + +// Helper function to load CSV test cases and populate conv_params vector +// Returns true if loading succeeded, false otherwise +inline bool load_and_populate_test_cases(const std::vector& csv_paths, + std::vector& conv_params, + const std::string& dimension_label) +{ + for(const auto& csv_path : csv_paths) + { + auto csv_cases = load_csv_test_cases(csv_path); + if(!csv_cases.empty()) + { + // Successfully loaded CSV data - add all test cases to conv_params + for(const auto& test_case : csv_cases) + { + conv_params.push_back(test_case); + } + std::cout << "Loaded " << csv_cases.size() << " " << dimension_label + << " test cases from " << csv_path << std::endl; + return true; + } + } + + // Failed to load from any path + std::cerr << "ERROR: Failed to 
load CSV test data from any of these locations:" << std::endl;
+    for(const auto& path : csv_paths)
+    {
+        std::cerr << " - " << path << std::endl;
+    }
+    std::cerr << "\nPlease ensure CSV test data exists in one of these locations." << std::endl;
+    std::cerr << "Run generate_test_dataset.sh in test_data/ to create test datasets." << std::endl;
+
+    return false;
+}
+
+} // namespace test
+} // namespace ck
diff --git a/test/grouped_convnd_bwd_data/CMakeLists.txt b/test/grouped_convnd_bwd_data/CMakeLists.txt
index 1da477ebb3..a9413bd25b 100644
--- a/test/grouped_convnd_bwd_data/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_data/CMakeLists.txt
@@ -9,6 +9,10 @@ if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
         add_executable(test_grouped_convnd_bwd_data_xdl_large_cases test_grouped_convnd_bwd_data_xdl_large_cases.cpp)
         target_compile_options(test_grouped_convnd_bwd_data_xdl_large_cases PRIVATE -Wno-global-constructors -Wno-undef)
         target_link_libraries(test_grouped_convnd_bwd_data_xdl_large_cases PRIVATE gtest_main getopt::getopt utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
+
+        add_executable(test_grouped_convnd_bwd_data_dataset_xdl test_grouped_convnd_bwd_data_dataset_xdl.cpp)
+        target_compile_options(test_grouped_convnd_bwd_data_dataset_xdl PRIVATE -Wno-global-constructors -Wno-undef)
+        target_link_libraries(test_grouped_convnd_bwd_data_dataset_xdl PRIVATE gtest_main getopt::getopt utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
 endif()
 add_gtest_executable(test_grouped_convnd_bwd_data_wmma test_grouped_convnd_bwd_data_wmma.cpp)
 if(result EQUAL 0)
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_dataset_xdl.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_dataset_xdl.cpp
new file mode 100644
index 0000000000..53b8ec32af
--- /dev/null
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_dataset_xdl.cpp
@@ -0,0 +1,317 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <cstdlib>          // Standard C library (exit codes, malloc)
+#include <iostream>         // C++ I/O streams (cout, cerr)
+#include <initializer_list> // C++ initializer list support (unused here)
+#include <vector>           // C++ vector container - stores test cases
+#include <string>           // String operations
+#include <gtest/gtest.h>    // Google Test framework - provides TEST_P, INSTANTIATE_TEST_SUITE_P
+
+#include "profiler/profile_grouped_conv_bwd_data_impl.hpp" // The actual GPU profiler that does convolution work
+#include "../common/csv_test_loader.hpp"                   // Shared CSV test case loader
+
+using namespace ck::tensor_layout::convolution; // Import tensor layout names (GNHWK, GKYXC, etc.)
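// Illustration (an assumed example, not taken from any generated dataset): the
// loader above consumes rows in the documented column order, so a 2D entry in
// conv_test_set_2d_dataset.csv would look like this, with TestName as an
// optional trailing column:
//
//   NDim,Groups,BatchSize,OutChannels,InChannels,KernelH,KernelW,InputH,InputW,OutputH,OutputW,StrideH,StrideW,DilationH,DilationW,LeftPadH,LeftPadW,RightPadH,RightPadW,TestName
//   2,1,16,64,32,3,3,56,56,56,56,1,1,1,1,1,1,1,1,example_3x3_s1
//
// The OutputH/OutputW fields (row[9]/row[10]) are present in the file but not
// consumed by the 2D branch of load_csv_test_cases, which reads strides from
// row[11]/row[12]; the output shape is implied by the other parameters.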
+ +// Load CSV data for 2D tests +static std::vector Get2DTestCases() +{ + static std::vector test_cases; + if(test_cases.empty()) + { + std::string test_data_dir = ck::test::GetTestDataPath(); + if(test_data_dir.empty()) + { + std::cerr << "FATAL: test_data directory not found" << std::endl; + return test_cases; + } + + std::vector csv_paths = {test_data_dir + "/conv_test_set_2d_dataset.csv"}; + bool loaded = ck::test::load_and_populate_test_cases(csv_paths, test_cases, "2D"); + if(!loaded) + { + std::cerr << "FATAL: Failed to load 2D test cases from " << csv_paths[0] << std::endl; + } + } + return test_cases; +} + +// Load CSV data for 3D tests +static std::vector Get3DTestCases() +{ + static std::vector test_cases; + if(test_cases.empty()) + { + std::string test_data_dir = ck::test::GetTestDataPath(); + if(test_data_dir.empty()) + { + std::cerr << "FATAL: test_data directory not found" << std::endl; + return test_cases; + } + + std::vector csv_paths = {test_data_dir + "/conv_test_set_3d_dataset.csv"}; + bool loaded = ck::test::load_and_populate_test_cases(csv_paths, test_cases, "3D"); + if(!loaded) + { + std::cerr << "FATAL: Failed to load 3D test cases from " << csv_paths[0] << std::endl; + } + } + return test_cases; +} + +// Helper template to run a single backward data convolution test with split_k +template +bool RunConvBwdDataTest(const ck::utils::conv::ConvParam& param, ck::index_t split_k) +{ + return ck::profiler::profile_grouped_conv_bwd_data_impl(true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel + param, // ConvParam + split_k, // Split-K value + -1); // instance_index +} + +// 2D Tests - GNHWK layout - Float - SplitK=1 +class TestGroupedConvndBwdData2dGNHWKFloatSplitK1 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData2dGNHWKFloatSplitK1, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<2, GNHWK, GKYXC, GNHWC, float>(GetParam(), 1))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData2dGNHWKFloatSplitK1, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - GNHWK layout - Float - SplitK=2 +class TestGroupedConvndBwdData2dGNHWKFloatSplitK2 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData2dGNHWKFloatSplitK2, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<2, GNHWK, GKYXC, GNHWC, float>(GetParam(), 2))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData2dGNHWKFloatSplitK2, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - GNHWK layout - Half - SplitK=1 +class TestGroupedConvndBwdData2dGNHWKHalfSplitK1 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData2dGNHWKHalfSplitK1, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<2, GNHWK, GKYXC, GNHWC, ck::half_t>(GetParam(), 1))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData2dGNHWKHalfSplitK1, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - GNHWK layout - Half - SplitK=2 +class TestGroupedConvndBwdData2dGNHWKHalfSplitK2 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData2dGNHWKHalfSplitK2, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<2, GNHWK, GKYXC, GNHWC, ck::half_t>(GetParam(), 2))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData2dGNHWKHalfSplitK2, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - GNHWK layout - BFloat16 - SplitK=1 +class TestGroupedConvndBwdData2dGNHWKBFloat16SplitK1 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData2dGNHWKBFloat16SplitK1, ConvTest) +{ + 
EXPECT_TRUE((RunConvBwdDataTest<2, GNHWK, GKYXC, GNHWC, ck::bhalf_t>(GetParam(), 1))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData2dGNHWKBFloat16SplitK1, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - GNHWK layout - BFloat16 - SplitK=2 +class TestGroupedConvndBwdData2dGNHWKBFloat16SplitK2 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData2dGNHWKBFloat16SplitK2, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<2, GNHWK, GKYXC, GNHWC, ck::bhalf_t>(GetParam(), 2))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData2dGNHWKBFloat16SplitK2, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - NHWGK layout - Float - SplitK=1 +class TestGroupedConvndBwdData2dNHWGKFloatSplitK1 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData2dNHWGKFloatSplitK1, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<2, NHWGK, GKYXC, NHWGC, float>(GetParam(), 1))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData2dNHWGKFloatSplitK1, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - NHWGK layout - Float - SplitK=2 +class TestGroupedConvndBwdData2dNHWGKFloatSplitK2 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData2dNHWGKFloatSplitK2, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<2, NHWGK, GKYXC, NHWGC, float>(GetParam(), 2))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData2dNHWGKFloatSplitK2, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - NHWGK layout - Half - SplitK=1 +class TestGroupedConvndBwdData2dNHWGKHalfSplitK1 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData2dNHWGKHalfSplitK1, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<2, NHWGK, GKYXC, NHWGC, ck::half_t>(GetParam(), 1))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData2dNHWGKHalfSplitK1, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - NHWGK layout - Half - SplitK=2 +class TestGroupedConvndBwdData2dNHWGKHalfSplitK2 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData2dNHWGKHalfSplitK2, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<2, NHWGK, GKYXC, NHWGC, ck::half_t>(GetParam(), 2))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData2dNHWGKHalfSplitK2, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - NHWGK layout - BFloat16 - SplitK=1 +class TestGroupedConvndBwdData2dNHWGKBFloat16SplitK1 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData2dNHWGKBFloat16SplitK1, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<2, NHWGK, GKYXC, NHWGC, ck::bhalf_t>(GetParam(), 1))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData2dNHWGKBFloat16SplitK1, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - NHWGK layout - BFloat16 - SplitK=2 +class TestGroupedConvndBwdData2dNHWGKBFloat16SplitK2 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData2dNHWGKBFloat16SplitK2, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<2, NHWGK, GKYXC, NHWGC, ck::bhalf_t>(GetParam(), 2))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData2dNHWGKBFloat16SplitK2, + ::testing::ValuesIn(Get2DTestCases())); + +// 3D Tests - NDHWGK layout - Float - SplitK=1 +class TestGroupedConvndBwdData3dNDHWGKFloatSplitK1 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData3dNDHWGKFloatSplitK1, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<3, NDHWGK, GKZYXC, NDHWGC, float>(GetParam(), 1))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + 
TestGroupedConvndBwdData3dNDHWGKFloatSplitK1, + ::testing::ValuesIn(Get3DTestCases())); + +// 3D Tests - NDHWGK layout - Float - SplitK=2 +class TestGroupedConvndBwdData3dNDHWGKFloatSplitK2 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData3dNDHWGKFloatSplitK2, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<3, NDHWGK, GKZYXC, NDHWGC, float>(GetParam(), 2))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData3dNDHWGKFloatSplitK2, + ::testing::ValuesIn(Get3DTestCases())); + +// 3D Tests - NDHWGK layout - Half - SplitK=1 +class TestGroupedConvndBwdData3dNDHWGKHalfSplitK1 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData3dNDHWGKHalfSplitK1, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<3, NDHWGK, GKZYXC, NDHWGC, ck::half_t>(GetParam(), 1))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData3dNDHWGKHalfSplitK1, + ::testing::ValuesIn(Get3DTestCases())); + +// 3D Tests - NDHWGK layout - Half - SplitK=2 +class TestGroupedConvndBwdData3dNDHWGKHalfSplitK2 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData3dNDHWGKHalfSplitK2, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<3, NDHWGK, GKZYXC, NDHWGC, ck::half_t>(GetParam(), 2))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData3dNDHWGKHalfSplitK2, + ::testing::ValuesIn(Get3DTestCases())); + +// 3D Tests - NDHWGK layout - BFloat16 - SplitK=1 +class TestGroupedConvndBwdData3dNDHWGKBFloat16SplitK1 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData3dNDHWGKBFloat16SplitK1, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<3, NDHWGK, GKZYXC, NDHWGC, ck::bhalf_t>(GetParam(), 1))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData3dNDHWGKBFloat16SplitK1, + ::testing::ValuesIn(Get3DTestCases())); + +// 3D Tests - NDHWGK layout - BFloat16 - SplitK=2 +class TestGroupedConvndBwdData3dNDHWGKBFloat16SplitK2 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdData3dNDHWGKBFloat16SplitK2, ConvTest) +{ + EXPECT_TRUE((RunConvBwdDataTest<3, NDHWGK, GKZYXC, NDHWGC, ck::bhalf_t>(GetParam(), 2))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdData3dNDHWGKBFloat16SplitK2, + ::testing::ValuesIn(Get3DTestCases())); diff --git a/test/grouped_convnd_bwd_weight/CMakeLists.txt b/test/grouped_convnd_bwd_weight/CMakeLists.txt index e46113bea0..7b994f5bb8 100644 --- a/test/grouped_convnd_bwd_weight/CMakeLists.txt +++ b/test/grouped_convnd_bwd_weight/CMakeLists.txt @@ -4,6 +4,10 @@ if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12") add_gtest_executable(test_grouped_convnd_bwd_weight test_grouped_convnd_bwd_weight.cpp) target_link_libraries(test_grouped_convnd_bwd_weight PRIVATE utility device_grouped_conv1d_bwd_weight_instance device_grouped_conv2d_bwd_weight_instance device_grouped_conv3d_bwd_weight_instance device_grouped_convnd_bwd_weight_instance) + + add_executable(test_grouped_convnd_bwd_weight_dataset_xdl test_grouped_convnd_bwd_weight_dataset_xdl.cpp) + target_compile_options(test_grouped_convnd_bwd_weight_dataset_xdl PRIVATE -Wno-global-constructors -Wno-undef) + target_link_libraries(test_grouped_convnd_bwd_weight_dataset_xdl PRIVATE gtest_main getopt::getopt utility device_grouped_conv1d_bwd_weight_instance device_grouped_conv2d_bwd_weight_instance device_grouped_conv3d_bwd_weight_instance device_grouped_convnd_bwd_weight_instance) elseif(DL_KERNELS) add_gtest_executable(test_grouped_convnd_bwd_weight test_grouped_convnd_bwd_weight.cpp) target_link_libraries(test_grouped_convnd_bwd_weight 
PRIVATE utility device_grouped_conv1d_bwd_weight_instance device_grouped_conv2d_bwd_weight_instance device_grouped_conv3d_bwd_weight_instance)
diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_dataset_xdl.cpp b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_dataset_xdl.cpp
new file mode 100644
index 0000000000..aff6ba8873
--- /dev/null
+++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_dataset_xdl.cpp
@@ -0,0 +1,258 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <cstdlib>          // Standard C library (exit codes, malloc)
+#include <iostream>         // C++ I/O streams (cout, cerr)
+#include <initializer_list> // C++ initializer list support (unused here)
+#include <vector>           // C++ vector container - stores test cases
+#include <string>           // String operations
+#include <gtest/gtest.h>    // Google Test framework - provides TEST_P, INSTANTIATE_TEST_SUITE_P
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+#include "profiler/profile_grouped_conv_bwd_weight_impl.hpp" // The actual GPU profiler that does convolution work
+#include "../common/csv_test_loader.hpp"                     // Shared CSV test case loader
+
+using namespace ck::tensor_layout::convolution;
+
+// Load CSV data for 2D tests
+static std::vector<ck::utils::conv::ConvParam> Get2DTestCases()
+{
+    static std::vector<ck::utils::conv::ConvParam> test_cases;
+    if(test_cases.empty())
+    {
+        std::string test_data_dir = ck::test::GetTestDataPath();
+        if(test_data_dir.empty())
+        {
+            std::cerr << "FATAL: test_data directory not found" << std::endl;
+            return test_cases;
+        }
+
+        std::vector<std::string> csv_paths = {test_data_dir + "/conv_test_set_2d_dataset.csv"};
+        bool loaded = ck::test::load_and_populate_test_cases(csv_paths, test_cases, "2D");
+        if(!loaded)
+        {
+            std::cerr << "FATAL: Failed to load 2D test cases from " << csv_paths[0] << std::endl;
+        }
+    }
+    return test_cases;
+}
+
+// Load CSV data for 3D tests
+static std::vector<ck::utils::conv::ConvParam> Get3DTestCases()
+{
+    static std::vector<ck::utils::conv::ConvParam> test_cases;
+    if(test_cases.empty())
+    {
+        std::string test_data_dir = ck::test::GetTestDataPath();
+        if(test_data_dir.empty())
+        {
+            std::cerr << "FATAL: test_data directory not found" << std::endl;
+            return test_cases;
+        }
+
+        std::vector<std::string> csv_paths = {test_data_dir + "/conv_test_set_3d_dataset.csv"};
+        bool loaded = ck::test::load_and_populate_test_cases(csv_paths, test_cases, "3D");
+        if(!loaded)
+        {
+            std::cerr << "FATAL: Failed to load 3D test cases from " << csv_paths[0] << std::endl;
+        }
+    }
+    return test_cases;
+}
+
+// Helper template to run a single backward weight convolution test
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType>
+bool RunConvBwdWeightTest(const ck::utils::conv::ConvParam& param, ck::index_t split_k)
+{
+    return ck::profiler::profile_grouped_conv_bwd_weight_impl<NDimSpatial,
+                                                              InLayout,
+                                                              WeiLayout,
+                                                              OutLayout,
+                                                              InDataType,
+                                                              WeiDataType,
+                                                              OutDataType>(
+        true,                    // do_verification
+        1,                       // init_method
+        false,                   // do_log
+        false,                   // time_kernel
+        param,                   // ConvParam
+        std::to_string(split_k), // Split-K value as string
+        -1);                     // instance_index
+}
+
+// 2D Tests - NHWGK layout - Float - SplitK=1
+class TestGroupedConvndBwdWeight2dNHWGKFloatSplitK1
+    : public ::testing::TestWithParam<ck::utils::conv::ConvParam>
+{
+};
+TEST_P(TestGroupedConvndBwdWeight2dNHWGKFloatSplitK1, ConvTest)
+{
+    EXPECT_TRUE((RunConvBwdWeightTest<2, NHWGC, GKYXC, NHWGK, float, float, float>(GetParam(), 1)));
+}
+INSTANTIATE_TEST_SUITE_P(Dataset,
+                         TestGroupedConvndBwdWeight2dNHWGKFloatSplitK1,
+                         ::testing::ValuesIn(Get2DTestCases()));
+
+// 2D Tests - NHWGK layout - Float - SplitK=2
+class TestGroupedConvndBwdWeight2dNHWGKFloatSplitK2
+    : public ::testing::TestWithParam<ck::utils::conv::ConvParam>
+{
+};
+TEST_P(TestGroupedConvndBwdWeight2dNHWGKFloatSplitK2, ConvTest) +{ + EXPECT_TRUE((RunConvBwdWeightTest<2, NHWGC, GKYXC, NHWGK, float, float, float>(GetParam(), 2))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdWeight2dNHWGKFloatSplitK2, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - NHWGK layout - Half - SplitK=1 +class TestGroupedConvndBwdWeight2dNHWGKHalfSplitK1 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdWeight2dNHWGKHalfSplitK1, ConvTest) +{ + EXPECT_TRUE((RunConvBwdWeightTest<2, NHWGC, GKYXC, NHWGK, ck::half_t, ck::half_t, ck::half_t>( + GetParam(), 1))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdWeight2dNHWGKHalfSplitK1, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - NHWGK layout - Half - SplitK=2 +class TestGroupedConvndBwdWeight2dNHWGKHalfSplitK2 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdWeight2dNHWGKHalfSplitK2, ConvTest) +{ + EXPECT_TRUE((RunConvBwdWeightTest<2, NHWGC, GKYXC, NHWGK, ck::half_t, ck::half_t, ck::half_t>( + GetParam(), 2))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdWeight2dNHWGKHalfSplitK2, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - NHWGK layout - BFloat16 - SplitK=1 +class TestGroupedConvndBwdWeight2dNHWGKBFloat16SplitK1 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdWeight2dNHWGKBFloat16SplitK1, ConvTest) +{ + EXPECT_TRUE((RunConvBwdWeightTest<2, NHWGC, GKYXC, NHWGK, ck::bhalf_t, float, ck::bhalf_t>( + GetParam(), 1))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdWeight2dNHWGKBFloat16SplitK1, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - NHWGK layout - BFloat16 - SplitK=2 +class TestGroupedConvndBwdWeight2dNHWGKBFloat16SplitK2 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdWeight2dNHWGKBFloat16SplitK2, ConvTest) +{ + EXPECT_TRUE((RunConvBwdWeightTest<2, NHWGC, GKYXC, NHWGK, ck::bhalf_t, float, ck::bhalf_t>( + GetParam(), 2))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdWeight2dNHWGKBFloat16SplitK2, + ::testing::ValuesIn(Get2DTestCases())); + +// 3D Tests - NDHWGK layout - Float - SplitK=1 +class TestGroupedConvndBwdWeight3dNDHWGKFloatSplitK1 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdWeight3dNDHWGKFloatSplitK1, ConvTest) +{ + EXPECT_TRUE( + (RunConvBwdWeightTest<3, NDHWGC, GKZYXC, NDHWGK, float, float, float>(GetParam(), 1))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdWeight3dNDHWGKFloatSplitK1, + ::testing::ValuesIn(Get3DTestCases())); + +// 3D Tests - NDHWGK layout - Float - SplitK=2 +class TestGroupedConvndBwdWeight3dNDHWGKFloatSplitK2 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdWeight3dNDHWGKFloatSplitK2, ConvTest) +{ + EXPECT_TRUE( + (RunConvBwdWeightTest<3, NDHWGC, GKZYXC, NDHWGK, float, float, float>(GetParam(), 2))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdWeight3dNDHWGKFloatSplitK2, + ::testing::ValuesIn(Get3DTestCases())); + +// 3D Tests - NDHWGK layout - Half - SplitK=1 +class TestGroupedConvndBwdWeight3dNDHWGKHalfSplitK1 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdWeight3dNDHWGKHalfSplitK1, ConvTest) +{ + EXPECT_TRUE( + (RunConvBwdWeightTest<3, NDHWGC, GKZYXC, NDHWGK, ck::half_t, ck::half_t, ck::half_t>( + GetParam(), 1))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdWeight3dNDHWGKHalfSplitK1, + ::testing::ValuesIn(Get3DTestCases())); + +// 3D Tests - NDHWGK layout - Half - 
SplitK=2 +class TestGroupedConvndBwdWeight3dNDHWGKHalfSplitK2 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdWeight3dNDHWGKHalfSplitK2, ConvTest) +{ + EXPECT_TRUE( + (RunConvBwdWeightTest<3, NDHWGC, GKZYXC, NDHWGK, ck::half_t, ck::half_t, ck::half_t>( + GetParam(), 2))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdWeight3dNDHWGKHalfSplitK2, + ::testing::ValuesIn(Get3DTestCases())); + +// 3D Tests - NDHWGK layout - BFloat16 - SplitK=1 +class TestGroupedConvndBwdWeight3dNDHWGKBFloat16SplitK1 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdWeight3dNDHWGKBFloat16SplitK1, ConvTest) +{ + EXPECT_TRUE((RunConvBwdWeightTest<3, NDHWGC, GKZYXC, NDHWGK, ck::bhalf_t, float, ck::bhalf_t>( + GetParam(), 1))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdWeight3dNDHWGKBFloat16SplitK1, + ::testing::ValuesIn(Get3DTestCases())); + +// 3D Tests - NDHWGK layout - BFloat16 - SplitK=2 +class TestGroupedConvndBwdWeight3dNDHWGKBFloat16SplitK2 + : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndBwdWeight3dNDHWGKBFloat16SplitK2, ConvTest) +{ + EXPECT_TRUE((RunConvBwdWeightTest<3, NDHWGC, GKZYXC, NDHWGK, ck::bhalf_t, float, ck::bhalf_t>( + GetParam(), 2))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndBwdWeight3dNDHWGKBFloat16SplitK2, + ::testing::ValuesIn(Get3DTestCases())); diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_dataset_xdl.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_dataset_xdl.cpp index 0928256817..c99f7ccf2f 100644 --- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_dataset_xdl.cpp +++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_dataset_xdl.cpp @@ -5,330 +5,165 @@ #include // C++ I/O streams (cout, cerr) #include // C++ initializer list support (unused here) #include // C++ vector container - stores test cases -#include // File I/O for CSV reading -#include // String stream for CSV parsing #include // String operations -#include // Google Test framework - provides TYPED_TEST, EXPECT_TRUE +#include // Google Test framework - provides TEST_P, INSTANTIATE_TEST_SUITE_P #include "profiler/profile_grouped_conv_fwd_impl.hpp" // The actual GPU profiler that does convolution work - -// CSV Reader Function for Loading Test Cases -// Reads convolution parameters from CSV file and returns vector of ConvParam structures -std::vector load_csv_test_cases(const std::string& filename) -{ - std::vector conv_params; // Return vector - std::ifstream file(filename); // Open CSV file - - if(!file.is_open()) - { - std::cerr << "ERROR: Cannot open CSV file: " << filename << std::endl; - return conv_params; // Return empty vector on error - } - - std::string line; - int line_number = 0; - - // Read file line by line - while(std::getline(file, line)) - { - line_number++; - // Skip comment lines (starting with #) and empty lines - if(line.empty() || line[0] == '#') - { - continue; - } - - // Skip header line (contains column names) - if(line.find("NDim,Groups,BatchSize") != std::string::npos) - { - continue; - } - - // Parse CSV line using stringstream - std::stringstream ss(line); - std::string cell; - std::vector row; - - // Split line by commas - while(std::getline(ss, cell, ',')) - { - row.push_back(cell); - } - - // Validate row has correct number of columns - if(row.size() < 19) - { // Need at least 19 columns for 2D (excluding TestName) - std::cerr << "WARNING: Line " << line_number << " has insufficient columns (" - << row.size() << "), skipping" << std::endl; - continue; - } - - try - { 
- // Parse CSV data into ConvParam structure - // CSV Format: - // NDim,Groups,BatchSize,OutChannels,InChannels,KernelH,KernelW,InputH,InputW,OutputH,OutputW,StrideH,StrideW,DilationH,DilationW,LeftPadH,LeftPadW,RightPadH,RightPadW,TestName - int NDim = std::stoi(row[0]); - int Groups = std::stoi(row[1]); - int BatchSize = std::stoi(row[2]); - int OutChannels = std::stoi(row[3]); - int InChannels = std::stoi(row[4]); - - if(NDim == 2) - { - // 2D Convolution: {NDim, Groups, BatchSize, OutChannels, InChannels, - // {KernelH,KernelW}, {InputH,InputW}, {StrideH,StrideW}, {DilationH,DilationW}, - // {LeftPadH,LeftPadW}, {RightPadH,RightPadW}} - ck::utils::conv::ConvParam param = { - NDim, // NDim = 2 - Groups, // Groups - BatchSize, // Batch size - OutChannels, // Output channels - InChannels, // Input channels - {std::stoi(row[5]), std::stoi(row[6])}, // Kernel: {H, W} - {std::stoi(row[7]), std::stoi(row[8])}, // Input: {H, W} - {std::stoi(row[11]), std::stoi(row[12])}, // Stride: {H, W} - {std::stoi(row[13]), std::stoi(row[14])}, // Dilation: {H, W} - {std::stoi(row[15]), std::stoi(row[16])}, // Left pad: {H, W} - {std::stoi(row[17]), std::stoi(row[18])} // Right pad: {H, W} - }; - conv_params.push_back(param); - } - else if(NDim == 3) - { - // 3D Convolution: Need more columns for 3D parameters - if(row.size() < 26) - { - std::cerr << "WARNING: 3D convolution on line " << line_number - << " needs 26+ columns, has " << row.size() << ", skipping" - << std::endl; - continue; - } - // 3D Convolution: {NDim, Groups, BatchSize, OutChannels, InChannels, - // {KernelD,KernelH,KernelW}, {InputD,InputH,InputW}, {OutputD,OutputH,OutputW}, - // {StrideD,StrideH,StrideW}, {DilationD,DilationH,DilationW}, - // {LeftPadD,LeftPadH,LeftPadW}, {RightPadD,RightPadH,RightPadW}} - ck::utils::conv::ConvParam param = { - NDim, // NDim = 3 - Groups, // Groups - BatchSize, // Batch size - OutChannels, // Output channels - InChannels, // Input channels - {std::stoi(row[5]), std::stoi(row[6]), std::stoi(row[7])}, // Kernel: {D, H, W} - {std::stoi(row[8]), std::stoi(row[9]), std::stoi(row[10])}, // Input: {D, H, W} - {std::stoi(row[14]), - std::stoi(row[15]), - std::stoi(row[16])}, // Stride: {D, H, W} - {std::stoi(row[17]), - std::stoi(row[18]), - std::stoi(row[19])}, // Dilation: {D, H, W} - {std::stoi(row[20]), - std::stoi(row[21]), - std::stoi(row[22])}, // Left pad: {D, H, W} - {std::stoi(row[23]), - std::stoi(row[24]), - std::stoi(row[25])} // Right pad: {D, H, W} - }; - conv_params.push_back(param); - } - else - { - std::cerr << "WARNING: Unsupported NDim=" << NDim << " on line " << line_number - << ", skipping" << std::endl; - } - } - catch(const std::exception& e) - { - std::cerr << "ERROR: Failed to parse line " << line_number << ": " << e.what() - << std::endl; - continue; - } - } - - file.close(); - std::cout << "Loaded " << conv_params.size() << " test cases from " << filename << std::endl; - return conv_params; -} - -// Template class that works with different data types and tensor layouts -template -class TestGroupedConvndFwd : public ::testing::Test // Inherit from Google Test base class -{ - protected: - using DataType = - std::tuple_element_t<0, Tuple>; // Extract data type from tuple (fp32, fp16, bf16, int8) - using InLayout = - std::tuple_element_t<1, Tuple>; // Extract input tensor layout (NHWGC, NDHWGC, etc.) - using WeiLayout = - std::tuple_element_t<2, Tuple>; // Extract weight tensor layout (GKYXC, GKZYXC, etc.) 
- using OutLayout = - std::tuple_element_t<3, Tuple>; // Extract output tensor layout (NHWGK, NDHWGK, etc.) - using IndexType = ck::long_index_t; // 64-bit integer type for tensor dimensions - - // THE KEY CONTAINER: This stores all test case parameters - // Each test will push_back() ConvParam structures here - std::vector conv_params; - - // Template function to run tests for N-dimensional spatial convolution (2D or 3D) - template - void Run() - { - EXPECT_FALSE(conv_params.empty()); // Google Test assertion: ensure we have test cases - bool pass = true; // Track overall pass/fail across all test cases - - // MAIN LOOP: Execute every test case that was added to conv_params - for(auto& param : conv_params) - { - // CALL THE ACTUAL GPU PROFILER - This is where convolution happens! - pass = pass && - ck::profiler::profile_grouped_conv_fwd_impl( // Index type (int64) - true, // do_verification: Compare GPU result with CPU reference - 1, // init_method: How to initialize random test data (1 = uniform -5 to 5) - false, // do_log: Don't print detailed tensor values - false, // time_kernel: Don't do performance timing (just correctness) - param); // ConvParam: {NDim, Groups, Batch, OutChannels, InChannels, - // KernelSize, InputSize, ...} - } - EXPECT_TRUE(pass); // Google Test assertion: ALL test cases must pass - } -}; +#include "../common/csv_test_loader.hpp" // Shared CSV test case loader using namespace ck::tensor_layout::convolution; // Import tensor layout names (NHWGC, GKYXC, etc.) -// GOOGLE TEST TYPE COMBINATIONS: Define what data types and layouts to test -// This creates 4 separate test instances for 2D convolution: -using KernelTypes2d = - ::testing::Types, // fp32 test - std::tuple, // fp16 test - std::tuple, // bfloat16 test - std::tuple>; // int8 test - -// This creates 3 separate test instances for 3D convolution (no int8 support for 3D): -using KernelTypes3d = - ::testing::Types, // fp32 3D test - std::tuple, // fp16 3D test - std::tuple>; // bfloat16 3D test - -// Create specialized test classes that inherit from the base template class -template -class TestGroupedConvndFwd2d : public TestGroupedConvndFwd // 2D convolution test class +// Load CSV data for 2D tests +static std::vector Get2DTestCases() { -}; - -template -class TestGroupedConvndFwd3d : public TestGroupedConvndFwd // 3D convolution test class -{ -}; - -// GOOGLE TEST MAGIC: Create test suites -// This tells Google Test to create 4 test instances for 2D (fp32, fp16, bf16, int8) -TYPED_TEST_SUITE(TestGroupedConvndFwd2d, KernelTypes2d); -// This tells Google Test to create 3 test instances for 3D (fp32, fp16, bf16) -TYPED_TEST_SUITE(TestGroupedConvndFwd3d, KernelTypes3d); - -// THE ACTUAL 2D TEST - This runs 4 times (once for each data type: fp32, fp16, bf16, int8) -TYPED_TEST(TestGroupedConvndFwd2d, Test2D) -{ - // LOAD TEST CASES FROM CSV FILE instead of hardcoded cases - // Try different locations for the CSV file (build directory vs source directory) - std::vector csv_paths = { - "../test_data/conv_test_set_2d_dataset.csv", // From build directory to source - }; - - bool loaded = false; - for(const auto& csv_path : csv_paths) + static std::vector test_cases; + if(test_cases.empty()) { - auto csv_cases = load_csv_test_cases(csv_path); - if(!csv_cases.empty()) + std::string test_data_dir = ck::test::GetTestDataPath(); + if(test_data_dir.empty()) { - // Successfully loaded CSV data - add all test cases to conv_params - for(const auto& test_case : csv_cases) - { - this->conv_params.push_back(test_case); - } - 
std::cout << "Loaded " << csv_cases.size() << " 2D test cases from " << csv_path - << std::endl; - loaded = true; - break; + std::cerr << "FATAL: test_data directory not found" << std::endl; + return test_cases; + } + + std::vector csv_paths = {test_data_dir + "/conv_test_set_2d_dataset.csv"}; + bool loaded = ck::test::load_and_populate_test_cases(csv_paths, test_cases, "2D"); + if(!loaded) + { + std::cerr << "FATAL: Failed to load 2D test cases from " << csv_paths[0] << std::endl; } } - - // FAIL if CSV loading fails - no fallback! - if(!loaded) - { - std::cerr << "ERROR: Failed to load CSV test data from any of these locations:" - << std::endl; - for(const auto& path : csv_paths) - { - std::cerr << " - " << path << std::endl; - } - std::cerr << "\nPlease ensure CSV test data exists in one of these locations." << std::endl; - std::cerr << "Run generate_test_dataset.sh in test_data/ to create test datasets." - << std::endl; - - // Force test failure - no test cases means test should fail - EXPECT_TRUE(loaded) << "CSV test data loading failed"; - } - - // Execute all test cases with 2D convolution - // This calls Run<2>() which loops through conv_params and calls GPU profiler for each - this->template Run<2>(); + return test_cases; } -// THE ACTUAL 3D TEST - This runs 3 times (once for each data type: fp32, fp16, bf16) -TYPED_TEST(TestGroupedConvndFwd3d, Test3D) +// Load CSV data for 3D tests +static std::vector Get3DTestCases() { - // LOAD TEST CASES FROM CSV FILE instead of hardcoded cases - // Try different locations for the CSV file (build directory vs source directory) - std::vector csv_paths = { - "../test_data/conv_test_set_3d_dataset.csv", // From build directory to source - }; - - bool loaded = false; - for(const auto& csv_path : csv_paths) + static std::vector test_cases; + if(test_cases.empty()) { - auto csv_cases = load_csv_test_cases(csv_path); - if(!csv_cases.empty()) + std::string test_data_dir = ck::test::GetTestDataPath(); + if(test_data_dir.empty()) { - // Successfully loaded CSV data - add all test cases to conv_params - for(const auto& test_case : csv_cases) - { - this->conv_params.push_back(test_case); - } - std::cout << "Loaded " << csv_cases.size() << " 3D test cases from " << csv_path - << std::endl; - loaded = true; - break; + std::cerr << "FATAL: test_data directory not found" << std::endl; + return test_cases; + } + + std::vector csv_paths = {test_data_dir + "/conv_test_set_3d_dataset.csv"}; + bool loaded = ck::test::load_and_populate_test_cases(csv_paths, test_cases, "3D"); + if(!loaded) + { + std::cerr << "FATAL: Failed to load 3D test cases from " << csv_paths[0] << std::endl; } } - - // FAIL if CSV loading fails - no fallback! - if(!loaded) - { - std::cerr << "ERROR: Failed to load CSV test data from any of these locations:" - << std::endl; - for(const auto& path : csv_paths) - { - std::cerr << " - " << path << std::endl; - } - std::cerr << "\nPlease ensure CSV test data exists in one of these locations." << std::endl; - std::cerr << "Run generate_test_dataset.sh in test_data/ to create test datasets." 
- << std::endl; - - // Force test failure - no test cases means test should fail - EXPECT_TRUE(loaded) << "CSV test data loading failed"; - } - - // Execute all test cases with 3D convolution - // This calls Run<3>() which loops through conv_params and calls GPU profiler for each - this->template Run<3>(); + return test_cases; } + +// Helper template to run a single convolution test +template +bool RunConvTest(const ck::utils::conv::ConvParam& param) +{ + using IndexType = ck::long_index_t; + return ck::profiler::profile_grouped_conv_fwd_impl(true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel + param); +} + +// 2D Tests - Float +class TestGroupedConvndFwd2dFloat : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndFwd2dFloat, ConvTest) +{ + EXPECT_TRUE((RunConvTest<2, NHWGC, GKYXC, NHWGK, float>(GetParam()))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndFwd2dFloat, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - Half +class TestGroupedConvndFwd2dHalf : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndFwd2dHalf, ConvTest) +{ + EXPECT_TRUE((RunConvTest<2, NHWGC, GKYXC, NHWGK, ck::half_t>(GetParam()))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndFwd2dHalf, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - BFloat16 +class TestGroupedConvndFwd2dBFloat16 : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndFwd2dBFloat16, ConvTest) +{ + EXPECT_TRUE((RunConvTest<2, NHWGC, GKYXC, NHWGK, ck::bhalf_t>(GetParam()))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndFwd2dBFloat16, + ::testing::ValuesIn(Get2DTestCases())); + +// 2D Tests - Int8 +class TestGroupedConvndFwd2dInt8 : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndFwd2dInt8, ConvTest) +{ + EXPECT_TRUE((RunConvTest<2, NHWGC, GKYXC, NHWGK, int8_t>(GetParam()))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndFwd2dInt8, + ::testing::ValuesIn(Get2DTestCases())); + +// 3D Tests - Float +class TestGroupedConvndFwd3dFloat : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndFwd3dFloat, ConvTest) +{ + EXPECT_TRUE((RunConvTest<3, NDHWGC, GKZYXC, NDHWGK, float>(GetParam()))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndFwd3dFloat, + ::testing::ValuesIn(Get3DTestCases())); + +// 3D Tests - Half +class TestGroupedConvndFwd3dHalf : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndFwd3dHalf, ConvTest) +{ + EXPECT_TRUE((RunConvTest<3, NDHWGC, GKZYXC, NDHWGK, ck::half_t>(GetParam()))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndFwd3dHalf, + ::testing::ValuesIn(Get3DTestCases())); + +// 3D Tests - BFloat16 +class TestGroupedConvndFwd3dBFloat16 : public ::testing::TestWithParam +{ +}; +TEST_P(TestGroupedConvndFwd3dBFloat16, ConvTest) +{ + EXPECT_TRUE((RunConvTest<3, NDHWGC, GKZYXC, NDHWGK, ck::bhalf_t>(GetParam()))); +} +INSTANTIATE_TEST_SUITE_P(Dataset, + TestGroupedConvndFwd3dBFloat16, + ::testing::ValuesIn(Get3DTestCases())); diff --git a/test_data/generate_test_dataset.sh b/test_data/generate_test_dataset.sh index e9c4937445..27f45a3bc7 100755 --- a/test_data/generate_test_dataset.sh +++ b/test_data/generate_test_dataset.sh @@ -8,6 +8,20 @@ set -e # Exit on error set +x # Disable command echo (even if called with bash -x) +# Trap to kill all background jobs on script exit/interruption +cleanup() { + echo "" + echo "Cleaning up background processes..." 
+ # Kill all jobs in the current process group + jobs -p | xargs -r kill 2>/dev/null || true + wait 2>/dev/null || true + echo "Cleanup complete." + exit 1 +} + +# Set up trap for common termination signals +trap cleanup SIGINT SIGTERM EXIT + echo "==========================================" echo "CK Convolution Test Dataset Generator" echo "==========================================" @@ -18,7 +32,7 @@ if ! python3 -c "import torch" 2>/dev/null; then echo "PyTorch not found. Creating virtual environment..." # Create a virtual environment in the current directory - VENV_DIR="./pytorch_venv" + VENV_DIR="./.venv" if [ ! -d "$VENV_DIR" ]; then python3 -m venv $VENV_DIR || { echo "ERROR: Failed to create virtual environment." @@ -66,11 +80,71 @@ if ! $PYTHON_CMD -c "import torch; import sys; sys.exit(0 if torch.cuda.is_avail echo "Continuing anyway to generate placeholder data..." fi +# Parse command line arguments +CONFIG_MODE="full" # Default configuration mode: 'small', 'half' or 'full' +MAX_PARALLEL_JOBS=1 # Default number of parallel jobs +NUM_GPUS=1 # Number of GPUs to use (0 means no GPU assignment) + +# Process arguments +while [[ $# -gt 0 ]]; do + case $1 in + -j) + MAX_PARALLEL_JOBS="$2" + shift 2 + ;; + -j*) + MAX_PARALLEL_JOBS="${1#-j}" + shift + ;; + --gpus) + NUM_GPUS="$2" + shift 2 + ;; + small|half|full) + CONFIG_MODE="$1" + shift + ;; + *) + echo "Usage: $0 [small|half|full] [-j ] [--gpus ]" + echo " Configuration modes: small, half, full (default: full)" + echo " -j : Number of parallel jobs (default: 1)" + echo " --gpus : Number of GPUs to use (e.g., 8 for GPUs 0-7)" + exit 1 + ;; + esac +done + +# Setup GPU array if GPUs are requested +if [ $NUM_GPUS -gt 0 ]; then + # Auto-detect available GPUs + AVAILABLE_GPUS_COUNT=$(rocm-smi --showid 2>/dev/null | grep -oP 'GPU\[\K[0-9]+' | wc -l) + if [ "$AVAILABLE_GPUS_COUNT" -gt 0 ]; then + MAX_AVAILABLE=$AVAILABLE_GPUS_COUNT + else + MAX_AVAILABLE=0 + fi + + # Validate requested GPU count + if [ $NUM_GPUS -gt $MAX_AVAILABLE ]; then + echo "WARNING: Requested $NUM_GPUS GPUs but only $MAX_AVAILABLE available. Using $MAX_AVAILABLE GPUs." + NUM_GPUS=$MAX_AVAILABLE + fi + + # Build GPU array (0 to NUM_GPUS-1) + GPU_ARRAY=() + for ((i=0; i /dev/null 2>> $OUTPUT_DIR/${model}_miopen_log_2d.txt || true - + # Run in background + ( + # Set HIP_VISIBLE_DEVICES if GPU was assigned + if [ -n "$GPU_ID" ]; then + export HIP_VISIBLE_DEVICES=$GPU_ID + fi + + MIOPEN_ENABLE_LOGGING_CMD=1 $PYTHON_CMD run_model_with_miopen.py \ + --model $model --batch-size $batch_size --channels $channels --height $height --width $width --precision $precision \ + > /dev/null 2>> $OUTPUT_DIR/${model}_miopen_log_2d.txt || true + echo -e "${GREEN}[DONE]${NC} ${CYAN}2D${NC} ${YELLOW}$CONFIG_NAME${NC}" + ) & + + job_pids+=($!) + + # Limit number of parallel jobs + if [ ${#job_pids[@]} -ge $MAX_PARALLEL_JOBS ]; then + # Wait for any job to complete + wait -n + # Remove completed jobs from array + for i in "${!job_pids[@]}"; do + if ! kill -0 "${job_pids[$i]}" 2>/dev/null; then + unset 'job_pids[$i]' + fi + done + job_pids=("${job_pids[@]}") # Re-index array + fi done < $OUTPUT_DIR/model_configs_2d.csv +# Wait for all remaining 2D jobs to complete +echo "Waiting for remaining 2D jobs to complete..." +wait + +echo "All 2D models processed!" +echo "" + # Process 3D models from CSV configuration file echo "Processing 3D models from $OUTPUT_DIR/model_configs_3d.csv..." 
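The 2D loop above and the 3D loop below share one scheduling pattern: each model run is forked into a background subshell pinned to a round-robin GPU via HIP_VISIBLE_DEVICES, concurrency is capped with wait -n, and finished PIDs are pruned from the tracking array. A minimal standalone sketch of that pattern follows (ALL_CONFIGS and run_one_config are hypothetical stand-ins; wait -n requires bash 4.3+):

#!/usr/bin/env bash
# Sketch of the job-scheduling pattern used by generate_test_dataset.sh.
MAX_PARALLEL_JOBS=4
GPU_ARRAY=(0 1 2 3)
GPU_COUNTER=0
job_pids=()

for config in "${ALL_CONFIGS[@]}"; do                         # hypothetical config list
    GPU_ID=${GPU_ARRAY[$((GPU_COUNTER % ${#GPU_ARRAY[@]}))]}  # round-robin device pick
    GPU_COUNTER=$((GPU_COUNTER + 1))
    (
        export HIP_VISIBLE_DEVICES=$GPU_ID                    # pin this job to one GPU
        run_one_config "$config"                              # hypothetical worker command
    ) &
    job_pids+=($!)
    if [ ${#job_pids[@]} -ge $MAX_PARALLEL_JOBS ]; then
        wait -n                                               # block until any one job exits
        for i in "${!job_pids[@]}"; do                        # drop PIDs that have finished
            kill -0 "${job_pids[$i]}" 2>/dev/null || unset 'job_pids[$i]'
        done
        job_pids=("${job_pids[@]}")                           # re-index the array
    fi
done
wait                                                          # drain the remaining jobs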
@@ -175,6 +291,10 @@ CURRENT_3D_CONFIG=0 echo "Total 3D configurations to process: $TOTAL_3D_CONFIGS" echo "" +# Reset job tracking array +declare -a job_pids=() +# GPU counter continues from 2D models for round-robin assignment + # Read 3D configurations from CSV (skip comments and header) while IFS=',' read -r config_name model batch_size channels temporal_size height width precision; do # Skip comments and empty lines @@ -185,21 +305,59 @@ while IFS=',' read -r config_name model batch_size channels temporal_size height # Increment counter CURRENT_3D_CONFIG=$((CURRENT_3D_CONFIG + 1)) - # Build configuration command for 3D models CONFIG="--model $model --batch-size $batch_size --channels $channels --temporal-size $temporal_size --height $height --width $width --precision $precision" CONFIG_NAME="$config_name" - echo -e "${GREEN}[${CURRENT_3D_CONFIG}/${TOTAL_3D_CONFIGS}]${NC} ${CYAN}3D${NC} ${YELLOW}$CONFIG_NAME${NC}" + # Assign GPU in round-robin fashion if GPUs are specified + if [ $NUM_GPUS -gt 0 ]; then + GPU_ID=${GPU_ARRAY[$((GPU_COUNTER % NUM_GPUS))]} + GPU_COUNTER=$((GPU_COUNTER + 1)) + echo -e "${GREEN}[${CURRENT_3D_CONFIG}/${TOTAL_3D_CONFIGS}]${NC} ${CYAN}3D${NC} ${YELLOW}$CONFIG_NAME${NC} ${PURPLE}[GPU ${GPU_ID}]${NC} - Starting in background" + else + GPU_ID="" + echo -e "${GREEN}[${CURRENT_3D_CONFIG}/${TOTAL_3D_CONFIGS}]${NC} ${CYAN}3D${NC} ${YELLOW}$CONFIG_NAME${NC} - Starting in background" + fi + # Run in background + ( + # Set HIP_VISIBLE_DEVICES if GPU was assigned + if [ -n "$GPU_ID" ]; then + export HIP_VISIBLE_DEVICES=$GPU_ID + fi + + MIOPEN_ENABLE_LOGGING_CMD=1 $PYTHON_CMD run_model_with_miopen.py \ + --model $model --batch-size $batch_size --channels $channels --temporal-size $temporal_size --height $height --width $width --precision $precision \ + > /dev/null 2>> $OUTPUT_DIR/${model}_miopen_log_3d.txt || true + echo -e "${GREEN}[DONE]${NC} ${CYAN}3D${NC} ${YELLOW}$CONFIG_NAME${NC}" + ) & - # Actual run with logging (suppress stdout, only capture stderr with MIOpen commands) - MIOPEN_ENABLE_LOGGING_CMD=1 $PYTHON_CMD run_model_with_miopen.py \ - --model $model --batch-size $batch_size --channels $channels --temporal-size $temporal_size --height $height --width $width --precision $precision \ - > /dev/null 2>> $OUTPUT_DIR/${model}_miopen_log_3d.txt || true + job_pids+=($!) + + # Limit number of parallel jobs + if [ ${#job_pids[@]} -ge $MAX_PARALLEL_JOBS ]; then + # Wait for any job to complete + wait -n + # Remove completed jobs from array + for i in "${!job_pids[@]}"; do + if ! kill -0 "${job_pids[$i]}" 2>/dev/null; then + unset 'job_pids[$i]' + fi + done + job_pids=("${job_pids[@]}") # Re-index array + fi done < $OUTPUT_DIR/model_configs_3d.csv +# Wait for all remaining 3D jobs to complete +echo "Waiting for remaining 3D jobs to complete..." +wait + +echo "All 3D models processed!" +echo "" + +# Disable trap on successful completion +trap - SIGINT SIGTERM EXIT echo "" echo "Step 3: Converting MIOpen commands to CSV test cases" @@ -311,7 +469,7 @@ if [ $COUNT_3D -gt 0 ]; then fi echo " - Intermediate files in: $OUTPUT_DIR/" echo "" -echo "To use these datasets:" -echo " 1. Build the test: cd ../script && make -j64 test_grouped_convnd_fwd_dataset_xdl" -echo " 2. Run the test: ./bin/test_grouped_convnd_fwd_dataset_xdl" +echo "To use these datasets for direction (bwd_data, bwd_weight, or fwd):" +echo " 1. Build the test: cd ../script && make -j64 test_grouped_convnd__dataset_xdl" +echo " 2. 
Run the test: ./bin/test_grouped_convnd__dataset_xdl" echo "" diff --git a/test_data/gtest_parallel.py b/test_data/gtest_parallel.py new file mode 100644 index 0000000000..9ea9ee79b0 --- /dev/null +++ b/test_data/gtest_parallel.py @@ -0,0 +1,1187 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# This file has been modified to allow round-robin GPU scheduling. +# Original file can be found at +# https://github.com/google/gtest-parallel/blob/cd488bdedc1d2cffb98201a17afc1b298b0b90f1/gtest_parallel.py +# Changes from the original file are subject to the following license: +# SPDX-License-Identifier: MIT +# +# Copyright 2013 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import errno +from functools import total_ordering +import gzip +import io +import json +import multiprocessing +import optparse +import os +import re +import shutil +import signal +import subprocess +import sys +import tempfile +import threading +import time + +if sys.version_info.major >= 3: + long = int + import _pickle as cPickle + import _thread as thread +else: + import cPickle + import thread + +from pickle import HIGHEST_PROTOCOL as PICKLE_HIGHEST_PROTOCOL + +if sys.platform == "win32": + import msvcrt +else: + import fcntl + + +# An object that catches SIGINT sent to the Python process and notices +# if processes passed to wait() die by SIGINT (we need to look for +# both of those cases, because pressing Ctrl+C can result in either +# the main process or one of the subprocesses getting the signal). +# +# Before a SIGINT is seen, wait(p) will simply call p.wait() and +# return the result. Once a SIGINT has been seen (in the main process +# or a subprocess, including the one the current call is waiting for), +# wait(p) will call p.terminate() and raise ProcessWasInterrupted. +class SigintHandler(object): + class ProcessWasInterrupted(Exception): + pass + + sigint_returncodes = { + -signal.SIGINT, # Unix + -1073741510, # Windows + } + + def __init__(self): + self.__lock = threading.Lock() + self.__processes = set() + self.__got_sigint = False + signal.signal(signal.SIGINT, lambda signal_num, frame: self.interrupt()) + + def __on_sigint(self): + self.__got_sigint = True + while self.__processes: + try: + self.__processes.pop().terminate() + except OSError: + pass + + def interrupt(self): + with self.__lock: + self.__on_sigint() + + def got_sigint(self): + with self.__lock: + return self.__got_sigint + + def wait(self, p, timeout_per_test): + with self.__lock: + if self.__got_sigint: + p.terminate() + self.__processes.add(p) + try: + code = p.wait(timeout_per_test) + except subprocess.TimeoutExpired: + p.terminate() + self.__processes.remove(p) + code = -errno.ETIME + with self.__lock: + self.__processes.discard(p) + if code in self.sigint_returncodes: + self.__on_sigint() + if self.__got_sigint: + raise self.ProcessWasInterrupted + return code + + +sigint_handler = SigintHandler() + + +# Return the width of the terminal, or None if it couldn't be +# determined (e.g. 
because we're not being run interactively). +def term_width(out): + if not out.isatty(): + return None + try: + p = subprocess.Popen( + ["stty", "size"], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + (out, err) = p.communicate() + if p.returncode != 0 or err: + return None + return int(out.split()[1]) + except (IndexError, OSError, ValueError): + return None + + +# Output transient and permanent lines of text. If several transient +# lines are written in sequence, the new will overwrite the old. We +# use this to ensure that lots of unimportant info (tests passing) +# won't drown out important info (tests failing). +class Outputter(object): + def __init__(self, out_file): + self.__out_file = out_file + self.__previous_line_was_transient = False + self.__width = term_width(out_file) # Line width, or None if not a tty. + + def transient_line(self, msg): + if self.__width is None: + self.__out_file.write(msg + "\n") + self.__out_file.flush() + else: + self.__out_file.write("\r" + msg[: self.__width].ljust(self.__width)) + self.__previous_line_was_transient = True + + def flush_transient_output(self): + if self.__previous_line_was_transient: + self.__out_file.write("\n") + self.__previous_line_was_transient = False + + def permanent_line(self, msg): + self.flush_transient_output() + self.__out_file.write(msg + "\n") + if self.__width is None: + self.__out_file.flush() + + +def get_available_gpus(num_gpus): + """Get list of available GPU IDs based on HIP_VISIBLE_DEVICES and num_gpus. + + Returns a list of GPU IDs to use. + If HIP_VISIBLE_DEVICES is set, we return the first min(num_gpus, len(HIP_VISIBLE_DEVICES)) GPU IDs from it. + If not set, we return GPU IDs 0 to num_gpus-1. + """ + hip_visible = os.environ.get("HIP_VISIBLE_DEVICES", None) + + # Treat empty string as not set + if hip_visible is not None and hip_visible.strip(): + # Parse HIP_VISIBLE_DEVICES to get the list of available GPU IDs + try: + available_gpu_ids = [ + gpu_id.strip() for gpu_id in hip_visible.split(",") if gpu_id.strip() + ] + except ValueError: + sys.stderr.write( + "Warning: Invalid HIP_VISIBLE_DEVICES format, using GPU 0\n" + ) + return ["0"] + + # If parsing resulted in empty list, treat as not set + if not available_gpu_ids: + return [str(i) for i in range(num_gpus)] + + # Use the first min(num_gpus, len(available_gpu_ids)) GPUs from the list + num_to_use = min(num_gpus, len(available_gpu_ids)) + return available_gpu_ids[:num_to_use] + else: + # If HIP_VISIBLE_DEVICES is not set or empty, use GPU IDs 0 to num_gpus-1 + return [str(i) for i in range(num_gpus)] + + +def get_save_file_path(): + """Return path to file for saving transient data.""" + if sys.platform == "win32": + default_cache_path = os.path.join(os.path.expanduser("~"), "AppData", "Local") + cache_path = os.environ.get("LOCALAPPDATA", default_cache_path) + else: + # We don't use xdg module since it's not a standard. + default_cache_path = os.path.join(os.path.expanduser("~"), ".cache") + cache_path = os.environ.get("XDG_CACHE_HOME", default_cache_path) + + if os.path.isdir(cache_path): + return os.path.join(cache_path, "gtest-parallel") + else: + sys.stderr.write("Directory {} does not exist".format(cache_path)) + return os.path.join(os.path.expanduser("~"), ".gtest-parallel-times") + + +@total_ordering +class Task(object): + """Stores information about a task (single execution of a test). + + This class stores information about the test to be executed (gtest binary and + test name), and its result (log file, exit code and runtime). 
+ Each task is uniquely identified by the gtest binary, the test name and an + execution number that increases each time the test is executed. + Additionaly we store the last execution time, so that next time the test is + executed, the slowest tests are run first. + """ + + def __init__( + self, + test_binary, + test_name, + test_command, + execution_number, + last_execution_time, + output_dir, + ): + self.test_name = test_name + self.output_dir = output_dir + self.test_binary = test_binary + self.test_command = test_command + self.execution_number = execution_number + self.last_execution_time = last_execution_time + + self.exit_code = None + self.runtime_ms = None + + self.test_id = (test_binary, test_name) + self.task_id = (test_binary, test_name, self.execution_number) + + self.log_file = Task._logname( + self.output_dir, self.test_binary, test_name, self.execution_number + ) + + def __sorting_key(self): + # Unseen or failing tests (both missing execution time) take precedence over + # execution time. Tests are greater (seen as slower) when missing times so + # that they are executed first. + return (1 if self.last_execution_time is None else 0, self.last_execution_time) + + def __eq__(self, other): + return self.__sorting_key() == other.__sorting_key() + + def __ne__(self, other): + return not (self == other) + + def __lt__(self, other): + return self.__sorting_key() < other.__sorting_key() + + @staticmethod + def _normalize(string): + return re.sub("[^A-Za-z0-9]", "_", string) + + @staticmethod + def _logname(output_dir, test_binary, test_name, execution_number): + # Store logs to temporary files if there is no output_dir. + if output_dir is None: + (log_handle, log_name) = tempfile.mkstemp( + prefix="gtest_parallel_", suffix=".log" + ) + os.close(log_handle) + return log_name + + log_name = "%s-%s-%d.log" % ( + Task._normalize(os.path.basename(test_binary)), + Task._normalize(test_name), + execution_number, + ) + + return os.path.join(output_dir, log_name) + + def run(self, timeout_per_test, gpu_id=None): + begin = time.time() + with open(self.log_file, "w") as log: + # Set up environment with GPU assignment if specified + env = os.environ.copy() + if gpu_id is not None: + env["HIP_VISIBLE_DEVICES"] = str(gpu_id) + + # Get the absolute path to the test binary and its directory + # This handles both relative and absolute paths correctly + abs_test_binary = os.path.abspath(self.test_binary) + test_binary_dir = os.path.dirname(abs_test_binary) + + # Update the test command to use the absolute path + abs_test_command = [abs_test_binary] + self.test_command[1:] + + task = subprocess.Popen( + abs_test_command, stdout=log, stderr=log, env=env, cwd=test_binary_dir + ) + try: + self.exit_code = sigint_handler.wait(task, timeout_per_test) + except sigint_handler.ProcessWasInterrupted: + thread.exit() + self.runtime_ms = int(1000 * (time.time() - begin)) + self.last_execution_time = None if self.exit_code else self.runtime_ms + + +class TaskManager(object): + """Executes the tasks and stores the passed, failed and interrupted tasks. + + When a task is run, this class keeps track if it passed, failed or was + interrupted. After a task finishes it calls the relevant functions of the + Logger, TestResults and TestTimes classes, and in case of failure, retries the + test as specified by the --retry_failed flag. 
+ """ + + def __init__( + self, + times, + logger, + test_results, + task_factory, + times_to_retry, + initial_execution_number, + ): + self.times = times + self.logger = logger + self.test_results = test_results + self.task_factory = task_factory + self.times_to_retry = times_to_retry + self.initial_execution_number = initial_execution_number + + self.global_exit_code = 0 + + self.passed = [] + self.failed = [] + self.started = {} + self.timed_out = [] + self.execution_number = {} + + self.lock = threading.Lock() + + def __get_next_execution_number(self, test_id): + with self.lock: + next_execution_number = self.execution_number.setdefault( + test_id, self.initial_execution_number + ) + self.execution_number[test_id] += 1 + return next_execution_number + + def __register_start(self, task): + with self.lock: + self.started[task.task_id] = task + + def register_exit(self, task): + self.logger.log_exit(task) + self.times.record_test_time( + task.test_binary, task.test_name, task.last_execution_time + ) + if self.test_results: + self.test_results.log( + task.test_name, task.runtime_ms / 1000.0, task.exit_code + ) + + with self.lock: + self.started.pop(task.task_id) + if task.exit_code == 0: + self.passed.append(task) + elif task.exit_code == -errno.ETIME: + self.timed_out.append(task) + else: + self.failed.append(task) + + def run_task(self, task, timeout_per_test, gpu_id=None): + for try_number in range(self.times_to_retry + 1): + self.__register_start(task) + task.run(timeout_per_test, gpu_id) + self.register_exit(task) + + if task.exit_code == 0: + break + + if try_number < self.times_to_retry: + execution_number = self.__get_next_execution_number(task.test_id) + # We need create a new Task instance. Each task represents a single test + # execution, with its own runtime, exit code and log file. + task = self.task_factory( + task.test_binary, + task.test_name, + task.test_command, + execution_number, + task.last_execution_time, + task.output_dir, + ) + + with self.lock: + if task.exit_code != 0: + self.global_exit_code = task.exit_code + + +class FilterFormat(object): + def __init__(self, output_dir): + if sys.stdout.isatty(): + # stdout needs to be unbuffered since the output is interactive. 
+ if isinstance(sys.stdout, io.TextIOWrapper): + # workaround for https://bugs.python.org/issue17404 + sys.stdout = io.TextIOWrapper( + sys.stdout.detach(), + line_buffering=True, + write_through=True, + newline="\n", + ) + else: + sys.stdout = os.fdopen(sys.stdout.fileno(), "w", 0) + + self.output_dir = output_dir + + self.total_tasks = 0 + self.finished_tasks = 0 + self.out = Outputter(sys.stdout) + self.stdout_lock = threading.Lock() + + def move_to(self, destination_dir, tasks): + if self.output_dir is None: + return + + destination_dir = os.path.join(self.output_dir, destination_dir) + os.makedirs(destination_dir) + for task in tasks: + shutil.move(task.log_file, destination_dir) + + def print_tests(self, message, tasks, print_try_number, print_test_command): + self.out.permanent_line("%s (%s/%s):" % (message, len(tasks), self.total_tasks)) + for task in sorted(tasks): + runtime_ms = "Interrupted" + if task.runtime_ms is not None: + runtime_ms = "%d ms" % task.runtime_ms + if print_test_command: + try: + cmd_str = " ".join(task.test_command) + except TypeError: + cmd_str = task.test_command + self.out.permanent_line( + "%11s: %s%s" + % ( + runtime_ms, + cmd_str, + (" (try #%d)" % task.execution_number) + if print_try_number + else "", + ) + ) + else: + self.out.permanent_line( + "%11s: %s %s%s" + % ( + runtime_ms, + task.test_binary, + task.test_name, + (" (try #%d)" % task.execution_number) + if print_try_number + else "", + ) + ) + + def log_exit(self, task): + with self.stdout_lock: + self.finished_tasks += 1 + self.out.transient_line( + "[%d/%d] %s (%d ms)" + % ( + self.finished_tasks, + self.total_tasks, + task.test_name, + task.runtime_ms, + ) + ) + if task.exit_code != 0: + signal_name = None + if task.exit_code < 0: + try: + signal_name = signal.Signals(-task.exit_code).name + except ValueError: + pass + + with open(task.log_file) as f: + for line in f.readlines(): + self.out.permanent_line(line.rstrip()) + if task.exit_code is None: + self.out.permanent_line( + "[%d/%d] %s aborted after %d ms" + % ( + self.finished_tasks, + self.total_tasks, + task.test_name, + task.runtime_ms, + ) + ) + elif task.exit_code == -errno.ETIME: + self.out.permanent_line( + "\033[31m[ TIMEOUT ]\033[0m %s timed out after %d s" + % (task.test_name, task.runtime_ms / 1000) + ) + elif signal_name is not None: + self.out.permanent_line( + "[%d/%d] %s killed by signal %s (%d ms)" + % ( + self.finished_tasks, + self.total_tasks, + task.test_name, + signal_name, + task.runtime_ms, + ) + ) + else: + self.out.permanent_line( + "[%d/%d] %s returned with exit code %d (%d ms)" + % ( + self.finished_tasks, + self.total_tasks, + task.test_name, + task.exit_code, + task.runtime_ms, + ) + ) + + if self.output_dir is None: + # Try to remove the file 100 times (sleeping for 0.1 second in between). + # This is a workaround for a process handle seemingly holding on to the + # file for too long inside os.subprocess. This workaround is in place + # until we figure out a minimal repro to report upstream (or a better + # suspect) to prevent os.remove exceptions. + num_tries = 100 + for i in range(num_tries): + try: + os.remove(task.log_file) + except OSError as e: + if e.errno is not errno.ENOENT: + if i is num_tries - 1: + self.out.permanent_line( + "Could not remove temporary log file: " + str(e) + ) + else: + time.sleep(0.1) + continue + break + + def log_tasks(self, total_tasks): + self.total_tasks += total_tasks + self.out.transient_line("[0/%d] Running tests..." 
% self.total_tasks) + + def summarize(self, passed_tasks, failed_tasks, interrupted_tasks): + stats = {} + + def add_stats(stats, task, idx): + task_key = (task.test_binary, task.test_name) + if task_key not in stats: + # (passed, failed, interrupted) task_key is added as tie breaker to get + # alphabetic sorting on equally-stable tests + stats[task_key] = [0, 0, 0, task_key] + stats[task_key][idx] += 1 + + for task in passed_tasks: + add_stats(stats, task, 0) + for task in failed_tasks: + add_stats(stats, task, 1) + for task in interrupted_tasks: + add_stats(stats, task, 2) + + self.out.permanent_line("SUMMARY:") + for task_key in sorted(stats, key=stats.__getitem__): + (num_passed, num_failed, num_interrupted, _) = stats[task_key] + (test_binary, task_name) = task_key + total_runs = num_passed + num_failed + num_interrupted + if num_passed == total_runs: + continue + self.out.permanent_line( + " %s %s passed %d / %d times%s." + % ( + test_binary, + task_name, + num_passed, + total_runs, + "" + if num_interrupted == 0 + else (" (%d interrupted)" % num_interrupted), + ) + ) + + def flush(self): + self.out.flush_transient_output() + + +class CollectTestResults(object): + def __init__(self, json_dump_filepath): + self.test_results_lock = threading.Lock() + self.json_dump_file = open(json_dump_filepath, "w") + self.test_results = { + "interrupted": False, + "path_delimiter": ".", + # Third version of the file format. See the link in the flag description + # for details. + "version": 3, + "seconds_since_epoch": int(time.time()), + "num_failures_by_type": { + "PASS": 0, + "FAIL": 0, + "TIMEOUT": 0, + }, + "tests": {}, + } + + def log(self, test, runtime_seconds, exit_code): + if exit_code is None: + actual_result = "TIMEOUT" + elif exit_code == 0: + actual_result = "PASS" + else: + actual_result = "FAIL" + with self.test_results_lock: + self.test_results["num_failures_by_type"][actual_result] += 1 + results = self.test_results["tests"] + for name in test.split("."): + results = results.setdefault(name, {}) + + if results: + results["actual"] += " " + actual_result + results["times"].append(runtime_seconds) + else: # This is the first invocation of the test + results["actual"] = actual_result + results["times"] = [runtime_seconds] + results["time"] = runtime_seconds + results["expected"] = "PASS" + + def dump_to_file_and_close(self): + json.dump(self.test_results, self.json_dump_file) + self.json_dump_file.close() + + +# Record of test runtimes. Has built-in locking. +class TestTimes(object): + class LockedFile(object): + def __init__(self, filename, mode): + self._filename = filename + self._mode = mode + self._fo = None + + def __enter__(self): + self._fo = open(self._filename, self._mode) + + # Regardless of opening mode we always seek to the beginning of file. + # This simplifies code working with LockedFile and also ensures that + # we lock (and unlock below) always the same region in file on win32. + self._fo.seek(0) + + try: + if sys.platform == "win32": + # We are locking here fixed location in file to use it as + # an exclusive lock on entire file. + msvcrt.locking(self._fo.fileno(), msvcrt.LK_LOCK, 1) + else: + fcntl.flock(self._fo.fileno(), fcntl.LOCK_EX) + except IOError: + self._fo.close() + raise + + return self._fo + + def __exit__(self, exc_type, exc_value, traceback): + # Flush any buffered data to disk. This is needed to prevent race + # condition which happens from the moment of releasing file lock + # till closing the file. 
+ self._fo.flush() + + try: + if sys.platform == "win32": + self._fo.seek(0) + msvcrt.locking(self._fo.fileno(), msvcrt.LK_UNLCK, 1) + else: + fcntl.flock(self._fo.fileno(), fcntl.LOCK_UN) + finally: + self._fo.close() + + return exc_value is None + + def __init__(self, save_file): + "Create new object seeded with saved test times from the given file." + self.__times = {} # (test binary, test name) -> runtime in ms + + # Protects calls to record_test_time(); other calls are not + # expected to be made concurrently. + self.__lock = threading.Lock() + + try: + with TestTimes.LockedFile(save_file, "rb") as fd: + times = TestTimes.__read_test_times_file(fd) + except IOError: + # We couldn't obtain the lock. + return + + # Discard saved times if the format isn't right. + if type(times) is not dict: + return + for (test_binary, test_name), runtime in times.items(): + if ( + type(test_binary) is not str + or type(test_name) is not str + or type(runtime) not in {int, long, type(None)} + ): + return + + self.__times = times + + def get_test_time(self, binary, testname): + """Return the last duration for the given test as an integer number of + milliseconds, or None if the test failed or if there's no record for it.""" + return self.__times.get((binary, testname), None) + + def record_test_time(self, binary, testname, runtime_ms): + """Record that the given test ran in the specified number of + milliseconds. If the test failed, runtime_ms should be None.""" + with self.__lock: + self.__times[(binary, testname)] = runtime_ms + + def write_to_file(self, save_file): + "Write all the times to file." + try: + with TestTimes.LockedFile(save_file, "a+b") as fd: + times = TestTimes.__read_test_times_file(fd) + + if times is None: + times = self.__times + else: + times.update(self.__times) + + # We erase data from file while still holding a lock to it. This + # way reading old test times and appending new ones are atomic + # for external viewer. + fd.seek(0) + fd.truncate() + with gzip.GzipFile(fileobj=fd, mode="wb") as gzf: + cPickle.dump(times, gzf, PICKLE_HIGHEST_PROTOCOL) + except IOError: + pass # ignore errors---saving the times isn't that important + + @staticmethod + def __read_test_times_file(fd): + try: + with gzip.GzipFile(fileobj=fd, mode="rb") as gzf: + times = cPickle.load(gzf) + except Exception: + # File doesn't exist, isn't readable, is malformed---whatever. + # Just ignore it. 
+ return None + else: + return times + + +def find_tests(binaries, additional_args, options, times): + test_count = 0 + tasks = [] + for test_binary in binaries: + command = [test_binary] + additional_args + if options.gtest_also_run_disabled_tests: + command += ["--gtest_also_run_disabled_tests"] + + list_command = command + ["--gtest_list_tests"] + if options.gtest_filter != "": + list_command += ["--gtest_filter=" + options.gtest_filter] + + # Get absolute path and directory for the test binary + abs_test_binary = os.path.abspath(test_binary) + test_binary_dir = os.path.dirname(abs_test_binary) + + # Create list command with absolute path + abs_list_command = [abs_test_binary] + additional_args + ["--gtest_list_tests"] + if options.gtest_also_run_disabled_tests: + abs_list_command += ["--gtest_also_run_disabled_tests"] + if options.gtest_filter != "": + abs_list_command += ["--gtest_filter=" + options.gtest_filter] + + try: + # Run the list command from the binary's directory so relative paths work + test_list = subprocess.check_output( + abs_list_command, stderr=subprocess.STDOUT, cwd=test_binary_dir + ) + except subprocess.CalledProcessError as e: + sys.exit("%s: %s\n%s" % (test_binary, str(e), e.output)) + + try: + test_list = test_list.split("\n") + except TypeError: + # subprocess.check_output() returns bytes in python3 + test_list = test_list.decode(sys.stdout.encoding).split("\n") + + command += ["--gtest_color=" + options.gtest_color] + + test_group = "" + for line in test_list: + if not line.strip(): + continue + if line[0] != " ": + # Remove comments for typed tests and strip whitespace. + test_group = line.split("#")[0].strip() + continue + # Remove comments for parameterized tests and strip whitespace. + line = line.split("#")[0].strip() + if not line: + continue + + test_name = test_group + line + if not options.gtest_also_run_disabled_tests and "DISABLED_" in test_name: + continue + + # Skip PRE_ tests which are used by Chromium. + if ".PRE_" in test_name: + continue + + last_execution_time = times.get_test_time(test_binary, test_name) + if options.failed and last_execution_time is not None: + continue + + test_command = command + ["--gtest_filter=" + test_name] + if (test_count - options.shard_index) % options.shard_count == 0: + for execution_number in range(options.repeat): + tasks.append( + Task( + test_binary, + test_name, + test_command, + execution_number + 1, + last_execution_time, + options.output_dir, + ) + ) + + test_count += 1 + + # Sort the tasks to run the slowest tests first, so that faster ones can be + # finished in parallel. + return sorted(tasks, reverse=True) + + +def execute_tasks( + tasks, + pool_size, + task_manager, + timeout_seconds, + timeout_per_test, + serialize_test_cases, + available_gpus=None, +): + class WorkerFn(object): + def __init__(self, tasks, running_groups, timeout_per_test, available_gpus): + self.tasks = tasks + self.running_groups = running_groups + self.timeout_per_test = timeout_per_test + self.available_gpus = available_gpus + self.task_lock = threading.Lock() + self.task_counter = 0 + + def __call__(self): + while True: + gpu_id = None + with self.task_lock: + for task_id in range(len(self.tasks)): + task = self.tasks[task_id] + + if self.running_groups is not None: + test_group = task.test_name.split(".")[0] + if test_group in self.running_groups: + # Try to find other non-running test group. 
+ continue + else: + self.running_groups.add(test_group) + + # Assign GPU in round-robin fashion if GPUs are available + if self.available_gpus: + gpu_id = self.available_gpus[ + self.task_counter % len(self.available_gpus) + ] + self.task_counter += 1 + + del self.tasks[task_id] + break + else: + # Either there is no tasks left or number or remaining test + # cases (groups) is less than number or running threads. + return + + task_manager.run_task(task, self.timeout_per_test, gpu_id) + + if self.running_groups is not None: + with self.task_lock: + self.running_groups.remove(test_group) + + def start_daemon(func): + t = threading.Thread(target=func) + t.daemon = True + t.start() + return t + + timeout = None + try: + if timeout_seconds: + timeout = threading.Timer(timeout_seconds, sigint_handler.interrupt) + timeout.start() + running_groups = set() if serialize_test_cases else None + worker_fn = WorkerFn(tasks, running_groups, timeout_per_test, available_gpus) + workers = [start_daemon(worker_fn) for _ in range(pool_size)] + for worker in workers: + worker.join() + finally: + if timeout: + timeout.cancel() + for task in list(task_manager.started.values()): + task.runtime_ms = timeout_seconds * 1000 + task_manager.register_exit(task) + + +def default_options_parser(): + parser = optparse.OptionParser( + usage="usage: %prog [options] binary [binary ...] -- [additional args]" + ) + + parser.add_option( + "-d", + "--output_dir", + type="string", + default=None, + help="Output directory for test logs. Logs will be " + "available under gtest-parallel-logs/, so " + "--output_dir=/tmp will results in all logs being " + "available under /tmp/gtest-parallel-logs/.", + ) + parser.add_option( + "-r", + "--repeat", + type="int", + default=1, + help="Number of times to execute all the tests.", + ) + parser.add_option( + "--retry_failed", + type="int", + default=0, + help="Number of times to repeat failed tests.", + ) + parser.add_option( + "--failed", + action="store_true", + default=False, + help="run only failed and new tests", + ) + parser.add_option( + "-w", + "--workers", + type="int", + default=multiprocessing.cpu_count(), + help="number of workers to spawn", + ) + parser.add_option( + "--gpus", + type="int", + default=1, + help="number of GPUs to use for parallel execution (default: 1)", + ) + parser.add_option( + "--gtest_color", type="string", default="yes", help="color output" + ) + parser.add_option("--gtest_filter", type="string", default="", help="test filter") + parser.add_option( + "--gtest_also_run_disabled_tests", + action="store_true", + default=False, + help="run disabled tests too", + ) + parser.add_option( + "--print_test_times", + action="store_true", + default=False, + help="list the run time of each test at the end of execution", + ) + parser.add_option( + "--print_test_command", + action="store_true", + default=False, + help="Print full test command instead of name", + ) + parser.add_option( + "--shard_count", + type="int", + default=1, + help="total number of shards (for sharding test execution " + "between multiple machines)", + ) + parser.add_option( + "--shard_index", + type="int", + default=0, + help="zero-indexed number identifying this shard (for " + "sharding test execution between multiple machines)", + ) + parser.add_option( + "--dump_json_test_results", + type="string", + default=None, + help="Saves the results of the tests as a JSON machine-" + "readable file. 
The format of the file is specified at " + "https://www.chromium.org/developers/the-json-test-results-format", + ) + parser.add_option( + "--timeout", + type="int", + default=None, + help="Interrupt all remaining processes after the given time (in seconds).", + ) + parser.add_option( + "--timeout_per_test", + type="int", + default=None, + help="Interrupt single processes after the given time (in seconds).", + ) + parser.add_option( + "--serialize_test_cases", + action="store_true", + default=False, + help="Do not run tests from the same test case in parallel.", + ) + return parser + + +def main(): + # Remove additional arguments (anything after --). + additional_args = [] + + for i in range(len(sys.argv)): + if sys.argv[i] == "--": + additional_args = sys.argv[i + 1 :] + sys.argv = sys.argv[:i] + break + + parser = default_options_parser() + (options, binaries) = parser.parse_args() + + if options.output_dir is not None and not os.path.isdir(options.output_dir): + parser.error( + "--output_dir value must be an existing directory, " + 'current value is "%s"' % options.output_dir + ) + + # Append gtest-parallel-logs to log output, this is to avoid deleting user + # data if an user passes a directory where files are already present. If a + # user specifies --output_dir=Docs/, we'll create Docs/gtest-parallel-logs + # and clean that directory out on startup, instead of nuking Docs/. + if options.output_dir: + options.output_dir = os.path.join(options.output_dir, "gtest-parallel-logs") + + if binaries == []: + parser.print_usage() + sys.exit(1) + + if options.shard_count < 1: + parser.error( + "Invalid number of shards: %d. Must be at least 1." % options.shard_count + ) + if not (0 <= options.shard_index < options.shard_count): + parser.error( + "Invalid shard index: %d. Must be between 0 and %d " + "(less than the number of shards)." + % (options.shard_index, options.shard_count - 1) + ) + + # Check that all test binaries have an unique basename. That way we can ensure + # the logs are saved to unique files even when two different binaries have + # common tests. + unique_binaries = set(os.path.basename(binary) for binary in binaries) + assert len(unique_binaries) == len(binaries), ( + "All test binaries must have an unique basename." + ) + + if options.output_dir: + # Remove files from old test runs. + if os.path.isdir(options.output_dir): + shutil.rmtree(options.output_dir) + # Create directory for test log output. + try: + os.makedirs(options.output_dir) + except OSError as e: + # Ignore errors if this directory already exists. 
+ if e.errno != errno.EEXIST or not os.path.isdir(options.output_dir): + raise e + + test_results = None + if options.dump_json_test_results is not None: + test_results = CollectTestResults(options.dump_json_test_results) + + save_file = get_save_file_path() + + times = TestTimes(save_file) + logger = FilterFormat(options.output_dir) + + task_manager = TaskManager( + times, logger, test_results, Task, options.retry_failed, options.repeat + 1 + ) + + # Get available GPUs based on HIP_VISIBLE_DEVICES and --gpus option + available_gpus = get_available_gpus(options.gpus) if options.gpus > 0 else None + + tasks = find_tests(binaries, additional_args, options, times) + logger.log_tasks(len(tasks)) + execute_tasks( + tasks, + options.workers, + task_manager, + options.timeout, + options.timeout_per_test, + options.serialize_test_cases, + available_gpus, + ) + + print_try_number = options.retry_failed > 0 or options.repeat > 1 + if task_manager.passed: + logger.move_to("passed", task_manager.passed) + if options.print_test_times: + logger.print_tests( + "PASSED TESTS", + task_manager.passed, + print_try_number, + options.print_test_command, + ) + + if task_manager.failed: + logger.print_tests( + "FAILED TESTS", + task_manager.failed, + print_try_number, + options.print_test_command, + ) + logger.move_to("failed", task_manager.failed) + + if task_manager.timed_out: + logger.print_tests( + "TIMED OUT TESTS", + task_manager.timed_out, + print_try_number, + options.print_test_command, + ) + logger.move_to("timed_out", task_manager.timed_out) + + if task_manager.started: + logger.print_tests( + "INTERRUPTED TESTS", + task_manager.started.values(), + print_try_number, + options.print_test_command, + ) + logger.move_to("interrupted", task_manager.started.values()) + + if options.repeat > 1 and (task_manager.failed or task_manager.started): + logger.summarize( + task_manager.passed, task_manager.failed, task_manager.started.values() + ) + + logger.flush() + times.write_to_file(save_file) + if test_results: + test_results.dump_to_file_and_close() + + if sigint_handler.got_sigint(): + return -signal.SIGINT + + return task_manager.global_exit_code + + +if __name__ == "__main__": + sys.exit(main())
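
Notes on the parallelization patterns introduced above. The sketches that follow are illustrative only: every name, path, or value called out as a placeholder is an assumption for the example, not something taken from the patch.

Round-robin GPU pinning appears in both layers. generate_test_dataset.sh assigns GPU_ARRAY[GPU_COUNTER % NUM_GPUS] to each background dataset job, and gtest_parallel.py assigns available_gpus[task_counter % len(available_gpus)] to each test; in both cases the choice reaches the child process by exporting HIP_VISIBLE_DEVICES so it sees a single device (a typical invocation, assuming the build-directory layout from the Jenkins stage, might be python3 ../test_data/gtest_parallel.py --workers 16 --gpus 4 ./bin/test_grouped_convnd_fwd_dataset_xdl). A minimal Python sketch of the pattern, with placeholder commands and GPU ids:

    import itertools
    import os
    import subprocess

    def launch_round_robin(commands, gpu_ids):
        # Each child gets HIP_VISIBLE_DEVICES set to exactly one GPU id,
        # cycling through the pool so jobs spread evenly across devices.
        pool = itertools.cycle(gpu_ids)
        procs = []
        for cmd in commands:
            env = os.environ.copy()
            env["HIP_VISIBLE_DEVICES"] = next(pool)
            procs.append(subprocess.Popen(cmd, env=env))
        return [p.wait() for p in procs]

    # Hypothetical usage: four no-op jobs spread across GPUs 0 and 1.
    launch_round_robin([["true"]] * 4, ["0", "1"])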
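
The shell script bounds concurrency by collecting PIDs in job_pids, calling wait -n once the array reaches MAX_PARALLEL_JOBS, and pruning finished PIDs with kill -0. In Python the same cap falls out of a fixed-size worker pool; a sketch under that assumption (the command list is a placeholder):

    import subprocess
    from concurrent.futures import ThreadPoolExecutor

    def run_bounded(commands, max_parallel_jobs):
        # At most max_parallel_jobs children run at once; extra submissions
        # queue until a worker frees up, mirroring the wait -n loop above.
        with ThreadPoolExecutor(max_workers=max_parallel_jobs) as pool:
            futures = [pool.submit(subprocess.run, cmd) for cmd in commands]
            return [f.result().returncode for f in futures]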
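
Expected behaviour of get_available_gpus(), shown as a hypothetical session (the device ids are made up): when HIP_VISIBLE_DEVICES is set, --gpus N selects the first N visible ids; when it is unset or empty, the function falls back to ids 0..N-1.

    import os

    os.environ["HIP_VISIBLE_DEVICES"] = "2,3,5"
    assert get_available_gpus(2) == ["2", "3"]  # first two visible ids

    del os.environ["HIP_VISIBLE_DEVICES"]
    assert get_available_gpus(2) == ["0", "1"]  # default numbering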
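
Scheduling order comes from Task.__sorting_key(): tests with no recorded time (new tests, or tests that failed last run, since Task.run() stores None as last_execution_time on failure) sort as slowest and therefore run first, and among timed tests the slower ones start earlier because find_tests() sorts in reverse. A standalone illustration of the key:

    def sorting_key(last_execution_time):
        # Mirrors Task.__sorting_key(): a missing time outranks any real one.
        return (1 if last_execution_time is None else 0, last_execution_time)

    tasks = [("fast", 12), ("slow", 900), ("new_or_failed", None)]
    ordered = sorted(tasks, key=lambda t: sorting_key(t[1]), reverse=True)
    assert [name for name, _ in ordered] == ["new_or_failed", "slow", "fast"]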
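
The shard filter in find_tests(), (test_count - options.shard_index) % options.shard_count == 0, hands each machine an interleaved slice of the test list so that disjoint shards cover every test exactly once. The arithmetic, checked on toy indices:

    shard_count, shard_index = 3, 1
    mine = [i for i in range(10) if (i - shard_index) % shard_count == 0]
    assert mine == [1, 4, 7]  # shard 1 of 3 over tests 0..9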
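
CollectTestResults.log() builds the Chromium JSON test-results layout: the test name is split on "." into nested dicts, and repeated runs of the same test append to its "actual" and "times" fields. A hypothetical two-run example (the output path and test name are made up):

    results = CollectTestResults("/tmp/results.json")  # placeholder path
    results.log("ConvTest.Fwd2D", 1.5, 0)  # first run passes
    results.log("ConvTest.Fwd2D", 1.8, 1)  # retry fails
    results.dump_to_file_and_close()
    # tests["ConvTest"]["Fwd2D"] now holds:
    #   {"actual": "PASS FAIL", "times": [1.5, 1.8],
    #    "time": 1.5, "expected": "PASS"}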
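
Recorded runtimes persist across invocations in a gzip-compressed pickle at get_save_file_path(), and TestTimes.LockedFile holds an exclusive lock (fcntl.flock on POSIX, msvcrt.locking on Windows) around each read-merge-write so concurrent runs update the cache without clobbering each other. A hypothetical round trip (the binary and test names are made up):

    save_file = get_save_file_path()
    times = TestTimes(save_file)
    times.record_test_time("./bin/some_test", "Suite.Case", 250)  # runtime in ms
    times.write_to_file(save_file)
    # A later run seeds its slowest-first ordering from the same cache:
    assert TestTimes(save_file).get_test_time("./bin/some_test", "Suite.Case") == 250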
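
Two timeout mechanisms coexist: --timeout arms a single threading.Timer that fires sigint_handler.interrupt() and tears down every outstanding subprocess at once, while --timeout_per_test bounds each individual p.wait() and maps expiry to -errno.ETIME. A reduced sketch of the one-shot global timer, simplified to a single child (command and timeout are placeholders):

    import subprocess
    import threading

    def run_with_global_timeout(cmd, timeout_seconds):
        # One timer bounds the whole run, unlike a per-job deadline.
        p = subprocess.Popen(cmd)
        timer = threading.Timer(timeout_seconds, p.terminate)
        timer.start()
        try:
            return p.wait()
        finally:
            timer.cancel()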