diff --git a/python/src/main.cpp b/python/src/main.cpp index 83807b5..8f653b6 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -32,15 +32,24 @@ PYBIND11_MODULE(kp, m) { .value("storage", kp::Tensor::TensorTypes::eStorage, "Tensor with host visible gpu memory.") .export_values(); + + + + py::class_<kp::Tensor, std::shared_ptr<kp::Tensor>>(m, "Tensor", DOC(kp, Tensor)) .def(py::init( - [](const std::vector<float>& data) { - return std::unique_ptr<kp::Tensor>(new kp::Tensor(data)); - }), DOC(kp, Tensor, Tensor, 2)) - .def(py::init( - [](const std::vector<float>& data, kp::Tensor::TensorTypes tensorTypes) { - return std::unique_ptr<kp::Tensor>(new kp::Tensor(data, tensorTypes)); - }), "Initialiser with list of data components and tensor GPU memory type.") + [np](const py::array_t<float> data, kp::Tensor::TensorTypes tensor_type) { + const py::array_t<float> flatdata = np.attr("ravel")(data); + const py::buffer_info info = flatdata.request(); + const float* ptr = (float*) info.ptr; + return std::unique_ptr<kp::Tensor>( + new kp::Tensor(std::vector<float>(ptr, ptr+flatdata.size()), tensor_type) + ); + }), + "Construct Tensor with an array as initial data and an optional kp.TensorType (default:device).", + py::arg("data"), + py::arg("tensor_type") = kp::Tensor::TensorTypes::eDevice + ) .def("data", &kp::Tensor::data, DOC(kp, Tensor, data)) .def("numpy", [](kp::Tensor& self) { return py::array(self.data().size(), self.data().data()); @@ -82,19 +91,27 @@ PYBIND11_MODULE(kp, m) { .def("map_data_from_host", &kp::Tensor::mapDataFromHostMemory, "Maps data into GPU memory from tensor local data.") .def("map_data_into_host", &kp::Tensor::mapDataIntoHostMemory, "Maps data from GPU memory into tensor local data."); + + + + py::class_<kp::Sequence, std::shared_ptr<kp::Sequence>>(m, "Sequence") .def("init", &kp::Sequence::init, "Initialises Vulkan resources within sequence using provided device.") + // record .def("begin", &kp::Sequence::begin, "Clears previous commands and starts recording commands in sequence which can be run in batch.") .def("end", &kp::Sequence::end, "Stops listening and recording for new commands.") + 
// eval .def("eval", &kp::Sequence::eval, "Executes the currently recorded commands synchronously by waiting on Vulkan Fence.") .def("eval_async", &kp::Sequence::evalAsync, "Executes the currently recorded commands asynchronously.") .def("eval_await", &kp::Sequence::evalAwait, "Waits until the execution finishes using Vulkan Fence.") + // status .def("is_running", &kp::Sequence::isRunning, "Checks whether the Sequence operations are currently still executing.") .def("is_rec", &kp::Sequence::isRecording, "Checks whether the Sequence is currently in recording mode.") .def("is_init", &kp::Sequence::isInit, "Checks if the Sequence has been initialized") + // record .def("record_tensor_create", &kp::Sequence::record<kp::OpTensorCreate>, "Records operation to create and initialise tensor GPU memory and buffer") @@ -106,23 +123,41 @@ PYBIND11_MODULE(kp, m) { "Records operation to sync tensor(s) from GPU memory to local memory using staging tensors") .def("record_algo_mult", &kp::Sequence::record<kp::OpMult>, "Records operation to run multiplication compute shader to two input tensors and an output tensor") - .def("record_algo_file", &kp::Sequence::record<kp::OpAlgoBase>, - "Records an operation using a custom shader provided from a shader path") + .def("record_algo_file", [](kp::Sequence &self, + std::vector<std::shared_ptr<kp::Tensor>> tensors, + const std::string& file_path, + std::tuple<uint32_t, uint32_t, uint32_t> work_group) -> bool { + const kp::OpAlgoBase::KomputeWorkgroup wgroup{ + std::get<0>(work_group), std::get<1>(work_group), std::get<2>(work_group), + }; + return self.record<kp::OpAlgoBase>(tensors, file_path, wgroup); + }, + "Records an operation using a custom shader provided from a shader path", + py::arg("tensors"), py::arg("file_path"), py::arg("work_group") = std::make_tuple(0,0,0) ) .def("record_algo_data", [](kp::Sequence &self, std::vector<std::shared_ptr<kp::Tensor>> tensors, - py::bytes &bytes) -> float { + py::bytes &bytes, + std::tuple<uint32_t, uint32_t, uint32_t> work_group) -> bool { // Bytes have to be converted into std::vector py::buffer_info info(py::buffer(bytes).request()); const char *data = 
reinterpret_cast<const char *>(info.ptr); size_t length = static_cast<size_t>(info.size); + const kp::OpAlgoBase::KomputeWorkgroup wgroup{ + std::get<0>(work_group), std::get<1>(work_group), std::get<2>(work_group), + }; return self.record<kp::OpAlgoBase>( - tensors, - std::vector<char>(data, data + length)); + tensors, std::vector<char>(data, data + length), wgroup + ); }, - "Records an operation using a custom shader provided as raw string or spirv bytes") + "Records an operation using a custom shader provided as spirv bytes", + py::arg("tensors"), py::arg("bytes"), py::arg("work_group") = std::make_tuple(0,0,0) ) .def("record_algo_lro", &kp::Sequence::record<kp::OpAlgoLhsRhsOut>, "Records operation to run left right out operation with custom shader"); + + + + py::class_<kp::Manager>(m, "Manager") .def(py::init<>(), "Default initializer uses device 0 and first compute compatible GPU queueFamily") .def(py::init( @@ -139,12 +174,14 @@ PYBIND11_MODULE(kp, m) { .def("build_tensor", &kp::Manager::buildTensor, py::arg("data"), py::arg("tensorType") = kp::Tensor::TensorTypes::eDevice, "Build and initialise tensor") + // Await functions .def("eval_await", &kp::Manager::evalOpAwait, py::arg("sequenceName"), py::arg("waitFor") = UINT64_MAX, "Awaits for asynchronous operation on a named Sequence") .def("eval_await_def", &kp::Manager::evalOpAwaitDefault, py::arg("waitFor") = UINT64_MAX, "Awaits for asynchronous operation on the last anonymous Sequence created") + // eval default .def("eval_tensor_create_def", &kp::Manager::evalOpDefault<kp::OpTensorCreate>, "Evaluates operation to create and initialise tensor GPU memory and buffer with new anonymous Sequence") @@ -181,6 +218,7 @@ PYBIND11_MODULE(kp, m) { "Evaluates an operation using a custom shader provided as spirv bytes with new anonymous Sequence") .def("eval_algo_lro_def", &kp::Manager::evalOpDefault<kp::OpAlgoLhsRhsOut>, "Evaluates operation to run left right out operation with custom shader with new anonymous Sequence") + // eval .def("eval_tensor_create", &kp::Manager::evalOp<kp::OpTensorCreate>, "Evaluates operation to create and initialise tensor GPU memory 
and buffer with explicitly named Sequence") @@ -220,6 +258,7 @@ PYBIND11_MODULE(kp, m) { "Evaluates an operation using a custom shader provided as spirv bytes with explicitly named Sequence") .def("eval_algo_lro", &kp::Manager::evalOp<kp::OpAlgoLhsRhsOut>, "Evaluates operation to run left right out operation with custom shader with explicitly named Sequence") + // eval async default .def("eval_async_tensor_create_def", &kp::Manager::evalOpAsyncDefault<kp::OpTensorCreate>, "Evaluates asynchronously operation to create and initialise tensor GPU memory and buffer with anonymous Sequence") @@ -256,6 +295,7 @@ PYBIND11_MODULE(kp, m) { "Evaluates asynchronously an operation using a custom shader provided as raw string or spirv bytes with anonymous Sequence") .def("eval_async_algo_lro_def", &kp::Manager::evalOpAsyncDefault<kp::OpAlgoLhsRhsOut>, "Evaluates asynchronously operation to run left right out operation with custom shader with anonymous Sequence") + // eval async .def("eval_async_tensor_create", &kp::Manager::evalOpAsync<kp::OpTensorCreate>, "Evaluates asynchronously operation to create and initialise tensor GPU memory and buffer with explicitly named Sequence") diff --git a/python/test/test_kompute.py b/python/test/test_kompute.py index 2232bea..9dee9df 100644 --- a/python/test/test_kompute.py +++ b/python/test/test_kompute.py @@ -110,3 +110,40 @@ def test_sequence(): assert tensor_out.data() == [2.0, 4.0, 6.0] assert np.all(tensor_out.numpy() == [2.0, 4.0, 6.0]) + + + +def test_workgroup(): + mgr = kp.Manager(0) + + tensor_a = kp.Tensor(np.zeros([16,8])) + tensor_b = kp.Tensor(np.zeros([16,8])) + mgr.eval_tensor_create_def([tensor_a, tensor_b]) + + shader_src = """ + #version 450 + + layout (local_size_x = 1) in; + + // The input tensors bind index is relative to index in parameter passed + layout(set = 0, binding = 0) writeonly buffer bout { float toutx[]; }; + layout(set = 0, binding = 1) writeonly buffer bout2 { float touty[]; }; + + void main() { + uint index = gl_WorkGroupID.x*gl_NumWorkGroups.y + gl_WorkGroupID.y; + + toutx[index] = 
gl_GlobalInvocationID.x; + touty[index] = gl_GlobalInvocationID.y; + } + """ + shader_src = bytes(shader_src, encoding='utf8') + + seq = mgr.create_sequence() + seq.begin() + seq.record_algo_data([tensor_a, tensor_b], shader_src, (16,8,1)) + seq.end() + seq.eval() + + mgr.eval_tensor_sync_local_def([tensor_a, tensor_b]) + assert np.all(tensor_a.numpy() == np.stack([np.arange(16)]*8, axis=1).ravel()) + assert np.all(tensor_b.numpy() == np.stack([np.arange(8)]*16, axis=0).ravel()) diff --git a/src/OpAlgoBase.cpp b/src/OpAlgoBase.cpp index b3ffbf7..c6ecf31 100644 --- a/src/OpAlgoBase.cpp +++ b/src/OpAlgoBase.cpp @@ -25,7 +25,7 @@ OpAlgoBase::OpAlgoBase(std::shared_ptr<vk::PhysicalDevice> physicalDevice, // If at least the x value is provided we use mainly the parameters // provided this->mKomputeWorkgroup = { - 0, + komputeWorkgroup.x, komputeWorkgroup.y > 0 ? komputeWorkgroup.y : 1, komputeWorkgroup.z > 0 ? komputeWorkgroup.z : 1 };