diff --git a/src/Algorithm.cpp b/src/Algorithm.cpp index 80db615..bce53c6 100644 --- a/src/Algorithm.cpp +++ b/src/Algorithm.cpp @@ -40,7 +40,7 @@ Algorithm::init(const std::vector& shaderFileData, this->createShaderModule(shaderFileData); std::vector sizes; - for (std::shared_ptr tensor: tensorParams) { + for (std::shared_ptr tensor : tensorParams) { SPDLOG_WARN("size: {}", tensor->size()); sizes.push_back(tensor->size()); } @@ -175,19 +175,19 @@ Algorithm::createPipeline(std::vector specializationData) std::vector specializationEntries; for (size_t i = 0; i < specializationData.size(); i++) { - vk::SpecializationMapEntry specializationEntry( - static_cast(i), - static_cast(sizeof(uint32_t) * i), - sizeof(uint32_t)); + vk::SpecializationMapEntry specializationEntry( + static_cast(i), + static_cast(sizeof(uint32_t) * i), + sizeof(uint32_t)); specializationEntries.push_back(specializationEntry); } - vk::SpecializationInfo specializationInfo( - static_cast(specializationEntries.size()), - specializationEntries.data(), - sizeof(uint32_t) * specializationEntries.size(), - specializationData.data()); + vk::SpecializationInfo specializationInfo( + static_cast(specializationEntries.size()), + specializationEntries.data(), + sizeof(uint32_t) * specializationEntries.size(), + specializationData.data()); vk::PipelineShaderStageCreateInfo shaderStage( vk::PipelineShaderStageCreateFlags(), diff --git a/src/include/kompute/Manager.hpp b/src/include/kompute/Manager.hpp index 6fb2a03..0d10515 100644 --- a/src/include/kompute/Manager.hpp +++ b/src/include/kompute/Manager.hpp @@ -67,7 +67,8 @@ class Manager * * @param tensors The tensors to be used in the operation recorded * @param sequenceName The name of the sequence to be retrieved or created - * @param TArgs Template parameters that will be used to initialise Operation to allow for extensible configurations on initialisation + * @param TArgs Template parameters that will be used to initialise + * Operation to allow for 
extensible configurations on initialisation */ template void evalOp(std::vector> tensors, diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp index 25e2966..466af2e 100644 --- a/src/include/kompute/Sequence.hpp +++ b/src/include/kompute/Sequence.hpp @@ -79,7 +79,8 @@ class Sequence * not be able to add the operation. * * @param tensors Vector of tensors to use for the operation - * @param TArgs Template parameters that are used to initialise operation which allows for extensible configurations on initialisation. + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. */ template bool record(std::vector> tensors, TArgs&&... params) diff --git a/src/include/kompute/operations/OpAlgoAllInOut.hpp b/src/include/kompute/operations/OpAlgoAllInOut.hpp deleted file mode 100644 index b4fe53a..0000000 --- a/src/include/kompute/operations/OpAlgoAllInOut.hpp +++ /dev/null @@ -1,207 +0,0 @@ -#pragma once - -#include - -#include "kompute/Core.hpp" - -#include "kompute/Algorithm.hpp" -#include "kompute/Tensor.hpp" - -#include "kompute/operations/OpAlgoBase.hpp" - -namespace kp { - -/** - * Operation base class to simplify the creation of operations that require - * multiple unknown number of tensors, all which will be expected to be - * Device storage tensors with the data already stored. All the tensors - * will also be used as outputs so the data will be copied from the device - * into the respective tensors. - * The template parameters specify the processing GPU layout number of - * iterations for each x, y, z parameter. More specifically, this will be the - * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)" - */ -template -class OpAlgoAllInOut : public OpAlgoBase -{ - public: - /** - * Base constructor, should not be used unless explicitly intended. 
- */ - OpAlgoAllInOut(); - - /** - * Default constructor with parameters that provides the bare minimum - * requirements for the operations to be able to create and manage their - * sub-components. - * - * @param physicalDevice Vulkan physical device used to find device queues - * @param device Vulkan logical device for passing to Algorithm - * @param commandBuffer Vulkan Command Buffer to record commands into - * @param tensors Tensors that are to be used in this operation - * @param freeTensors Whether operation manages the memory of the Tensors - */ - OpAlgoAllInOut(std::shared_ptr physicalDevice, - std::shared_ptr device, - std::shared_ptr commandBuffer, - std::vector>& tensors); - - /** - * Default destructor, which is in charge of destroying the algorithm - * components but does not destroy the underlying tensors - */ - ~OpAlgoAllInOut(); - - /** - * The init function is responsible for ensuring that all of the tensors - * passed into the function have been initialised and are of type Device. - * This is required as the parameters provided are expected to be - * used as storage buffers, as well as output buffers, so the data will - * be transferred out from the Device into the Tensors replacing existing - * data. - */ - void init() override; - - /** - * This records the commands that are to be sent to the GPU. This includes - * the barriers that ensure the memory has been copied before going in and - * out of the shader, as well as the dispatch operation that sends the - * shader processing to the gpu. This function also records the GPU memory - * copy of the output data for the staging bufffer so it can be read by the - * host. - */ - void record() override; - - /** - * Executes after the recorded commands are submitted, and performs a copy - * of the GPU Device memory into the staging buffer so the output data can - * be retrieved. 
- */ - void postSubmit() override; - - protected: - // -------------- ALWAYS OWNED RESOURCES - std::vector> mOutputStagingTensors; ///< Array of output staging tensors which will be expected to be the same size as the number of inputs. -}; - -} // End namespace kp - -// Including implemenation for template class -#ifndef OPALGOALLINOUT_CPP -#define OPALGOALLINOUT_CPP - -namespace kp { - -template -OpAlgoAllInOut::OpAlgoAllInOut() -{ - SPDLOG_DEBUG("Kompute OpAlgoAllInOut constructor base"); -} - -template -OpAlgoAllInOut::OpAlgoAllInOut(std::shared_ptr physicalDevice, - std::shared_ptr device, - std::shared_ptr commandBuffer, - std::vector>& tensors) - : OpAlgoBase(physicalDevice, device, commandBuffer, tensors) -{ - SPDLOG_DEBUG("Kompute OpAlgoAllInOut constructor with params"); -} - -template -OpAlgoAllInOut::~OpAlgoAllInOut() -{ - SPDLOG_DEBUG("Kompute OpAlgoAllInOut destructor started"); - - SPDLOG_DEBUG("Kompute OpAlgoAllInOut destroying staging tensors"); - for (std::shared_ptr stagingTensor : this->mOutputStagingTensors) { - stagingTensor->freeMemoryDestroyGPUResources(); - } -} - -template -void -OpAlgoAllInOut::init() -{ - SPDLOG_DEBUG("Kompute OpAlgoAllInOut init called"); - - if (this->mTensors.size() < 1) { - throw std::runtime_error( - "Kompute OpAlgoAllInOut called with less than 1 tensor"); - } - - for (std::shared_ptr tensor : this->mTensors) { - if(!tensor->isInit()) { - throw std::runtime_error("Kompute OpAlgoAllInOut validation failed; all tensor parameters must be initialised."); - } - } - - SPDLOG_DEBUG("Kompute OpAlgoAllInOut creating staging output tensors"); - - for (std::shared_ptr tensor : this->mTensors) { - std::shared_ptr stagingTensor = std::make_shared( - tensor->data(), Tensor::TensorTypes::eStaging); - stagingTensor->init( - this->mPhysicalDevice, this->mDevice, this->mCommandBuffer); - this->mOutputStagingTensors.push_back(stagingTensor); - } - - SPDLOG_DEBUG("Kompute OpAlgoAllInOut fetching spirv data"); - - std::vector& 
shaderFileData = this->fetchSpirvBinaryData(); - - SPDLOG_DEBUG("Kompute OpAlgoAllInOut Initialising algorithm component"); - - this->mAlgorithm->init(shaderFileData, this->mTensors); -} - -template -void -OpAlgoAllInOut::record() -{ - SPDLOG_DEBUG("Kompute OpAlgoAllInOut record called"); - - // Barrier to ensure the data is finished writing to buffer memory - for (std::shared_ptr tensor : this->mTensors) { - tensor->recordBufferMemoryBarrier( - vk::AccessFlagBits::eHostWrite, - vk::AccessFlagBits::eShaderRead, - vk::PipelineStageFlagBits::eHost, - vk::PipelineStageFlagBits::eComputeShader); - } - - this->mAlgorithm->recordDispatch(this->mX, this->mY, this->mZ); - - // Barrier to ensure the shader code is executed before buffer read - for (std::shared_ptr tensor : this->mTensors) { - tensor->recordBufferMemoryBarrier( - vk::AccessFlagBits::eShaderWrite, - vk::AccessFlagBits::eTransferRead, - vk::PipelineStageFlagBits::eComputeShader, - vk::PipelineStageFlagBits::eTransfer); - } - - // Record copy from and create barrier for STAGING tensors - for (std::shared_ptr stagingTensor : this->mOutputStagingTensors) { - stagingTensor->recordCopyFrom(this->mTensorOutput, true); - } -} - -template -void -OpAlgoAllInOut::postSubmit() -{ - SPDLOG_DEBUG("Kompute OpAlgoAllInOut postSubmit called"); - - for (size_t i = 0; i < this->mTensors.size(); i++) { - this->mOutputStagingTensors[i]->mapDataFromHostMemory(); - - this->mTensors[i]->setData(this->mOutputStagingTensors[i]->data()); - } -} - -} - -#endif // #ifndef OPALGOALLINOUT_CPP - - diff --git a/src/include/kompute/operations/OpAlgoBase.hpp b/src/include/kompute/operations/OpAlgoBase.hpp index 1ee42b4..72bb999 100644 --- a/src/include/kompute/operations/OpAlgoBase.hpp +++ b/src/include/kompute/operations/OpAlgoBase.hpp @@ -16,6 +16,18 @@ namespace kp { /** * Operation that provides a general abstraction that simplifies the use of * algorithm and parameter components which can be used with shaders. 
+ * By default it enables the user to provide a dynamic number of tensors + * which are then passed as inputs. + * + * All of these tensors are expected to be initialised; this is checked in the init function, which throws a std exception if any tensor is uninitialised. + * + * It is possible to also choose if the user requires all of the tensors to be + * copied from device memory to their host data. This can be disabled by either + * passing the copyOutputData constructor parameter and/or by overriding the + * functions to carry out copy commands accordingly. + * + * See OpLhsRhsOut for an example implementation with a more specific granularity of tensor parameters. + * + * The template parameters specify the processing GPU layout number of + * iterations for each x, y, z parameter. More specifically, this will be the + * input to ".dispatch(uint32_t tX, uint32_t tY, uint32_t, tZ)"