#include "gtest/gtest.h"

#include <chrono>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include "kompute/Kompute.hpp"

/**
 * Times the same compute shader run `numParallel` times one after another on
 * a default-constructed manager, then run `numParallel` times through
 * separately named sequences that are all submitted before any is awaited,
 * and expects the second approach to take less than 60% of the first.
 *
 * In both phases the output buffer `pb` of every run is checked against the
 * value the shader is expected to produce.
 *
 * NOTE(review): template arguments were missing from this file and have been
 * reconstructed. `kp::Tensor`, `float` and `char` follow from how the values
 * are used below; the operation types (`kp::OpTensorCreate`,
 * `kp::OpAlgoBase<>`, `kp::OpTensorSyncLocal`) and the microsecond duration
 * unit are assumptions - confirm against the Kompute headers in this tree.
 *
 * NOTE(review): `kp::Manager mgrAsync(0, { 0, 2 })` presumably selects
 * device 0 with queue family indices 0 and 2, which is hardware specific -
 * confirm for the GPU running this test. The final assertion is timing based
 * and will depend on that hardware.
 */
TEST(TestAsyncOperations, TestManagerAsync)
{
    const uint32_t size = 10;
    const uint32_t numParallel = 2;

    // Every loop iteration performs four +1 and three -1 atomic adds, so the
    // shared counter grows by exactly one per iteration and ends at
    // 100000000, which is then written to pb. pa is overwritten with zero.
    std::string shader(R"(
        #version 450

        layout (local_size_x = 1) in;

        layout(set = 0, binding = 0) buffer a { float pa[]; };
        layout(set = 0, binding = 1) buffer b { float pb[]; };

        shared uint sharedTotal[1];

        void main() {
            uint index = gl_GlobalInvocationID.x;

            sharedTotal[0] = 0;

            barrier();
            memoryBarrierShared();

            for (int i = 0; i < 100000000; i++)
            {
                atomicAdd(sharedTotal[0], 1);
                atomicAdd(sharedTotal[0], -1);
                atomicAdd(sharedTotal[0], 1);
                atomicAdd(sharedTotal[0], -1);
                atomicAdd(sharedTotal[0], 1);
                atomicAdd(sharedTotal[0], -1);
                atomicAdd(sharedTotal[0], 1);
            }

            barrier();
            memoryBarrierShared();

            pb[index] = sharedTotal[0];
            pa[index] = 0;
        }
    )");

    // Built once so neither timed region pays for the conversion.
    const std::vector<char> shaderBytes(shader.begin(), shader.end());

    std::vector<float> data(size, 0.0);
    // Expected contents of every pb buffer; identical for both phases.
    std::vector<float> expected(size, 100000000);

    // ---- Sequential phase -------------------------------------------------

    kp::Manager mgr;

    std::vector<std::shared_ptr<kp::Tensor>> inputsSyncA;
    std::vector<std::shared_ptr<kp::Tensor>> inputsSyncB;

    for (uint32_t i = 0; i < numParallel; i++) {
        inputsSyncA.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
        inputsSyncB.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
    }

    mgr.evalOpDefault<kp::OpTensorCreate>(inputsSyncA);
    mgr.evalOpDefault<kp::OpTensorCreate>(inputsSyncB);

    // steady_clock is monotonic, which is what interval timing needs;
    // high_resolution_clock may be an alias of the adjustable system clock.
    const auto startSync = std::chrono::steady_clock::now();

    for (uint32_t i = 0; i < numParallel; i++) {
        mgr.evalOpDefault<kp::OpAlgoBase<>>(
          { inputsSyncA[i], inputsSyncB[i] }, shaderBytes);
    }

    const auto endSync = std::chrono::steady_clock::now();
    const auto durationSync =
      std::chrono::duration_cast<std::chrono::microseconds>(endSync -
                                                            startSync)
        .count();

    mgr.evalOpDefault<kp::OpTensorSyncLocal>(inputsSyncB);

    for (uint32_t i = 0; i < numParallel; i++) {
        EXPECT_EQ(inputsSyncB[i]->data(), expected);
    }

    // ---- Asynchronous phase -----------------------------------------------

    kp::Manager mgrAsync(0, { 0, 2 });

    std::vector<std::shared_ptr<kp::Tensor>> inputsAsyncA;
    std::vector<std::shared_ptr<kp::Tensor>> inputsAsyncB;

    for (uint32_t i = 0; i < numParallel; i++) {
        inputsAsyncA.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
        inputsAsyncB.push_back(std::make_shared<kp::Tensor>(kp::Tensor(data)));
    }

    mgrAsync.evalOpDefault<kp::OpTensorCreate>(inputsAsyncA);
    mgrAsync.evalOpDefault<kp::OpTensorCreate>(inputsAsyncB);

    // One sequence name per parallel run ("async0", "async1", ...), built
    // once so the timed region below does no string construction.
    std::vector<std::string> sequenceNames;
    sequenceNames.reserve(numParallel);
    for (uint32_t i = 0; i < numParallel; i++) {
        sequenceNames.push_back("async" + std::to_string(i));
    }

    for (uint32_t i = 0; i < numParallel; i++) {
        // NOTE(review): the second argument is presumably the index into the
        // queue list given to mgrAsync - confirm against kp::Manager.
        mgrAsync.createManagedSequence(sequenceNames[i], i);
    }

    const auto startAsync = std::chrono::steady_clock::now();

    // Submit every run first ...
    for (uint32_t i = 0; i < numParallel; i++) {
        mgrAsync.evalOpAsync<kp::OpAlgoBase<>>(
          { inputsAsyncA[i], inputsAsyncB[i] }, sequenceNames[i], shaderBytes);
    }

    // TODO: Add function to print device details (or link)
    // TODO: Seems to fail if await called twice
    // ... and only then wait for each of them.
    for (uint32_t i = 0; i < numParallel; i++) {
        mgrAsync.evalOpAwait(sequenceNames[i]);
    }

    const auto endAsync = std::chrono::steady_clock::now();
    const auto durationAsync =
      std::chrono::duration_cast<std::chrono::microseconds>(endAsync -
                                                            startAsync)
        .count();

    mgrAsync.evalOpDefault<kp::OpTensorSyncLocal>(inputsAsyncB);

    for (uint32_t i = 0; i < numParallel; i++) {
        EXPECT_EQ(inputsAsyncB[i]->data(), expected);
    }

    // The speedup should be at least 40%
    EXPECT_LT(durationAsync, durationSync * 0.6);
}