From d59dc41ffcbac3bd2ba00eebebe1db13ffb15685 Mon Sep 17 00:00:00 2001 From: Alejandro Saucedo Date: Fri, 21 Aug 2020 19:15:07 +0100 Subject: [PATCH] Added initial implementation for algorithm and opMult --- src/Algorithm.cpp | 178 +++++++++++++++++++++++++++++++++++++++++ src/Algorithm.hpp | 32 +++++++- src/OpBase.hpp | 6 ++ src/OpCreateTensor.cpp | 6 ++ src/OpCreateTensor.hpp | 2 + src/OpMult.cpp | 29 ++++++- src/OpMult.hpp | 10 ++- src/Parameter.hpp | 52 +++++++++++- src/Tensor.cpp | 62 +++++++++++--- src/Tensor.hpp | 10 +++ src/main.cpp | 26 ++++-- 11 files changed, 385 insertions(+), 28 deletions(-) create mode 100644 src/Algorithm.cpp diff --git a/src/Algorithm.cpp b/src/Algorithm.cpp new file mode 100644 index 0000000..ced642d --- /dev/null +++ b/src/Algorithm.cpp @@ -0,0 +1,178 @@ +#include + +#include "Algorithm.hpp" + +namespace kp { + +Algorithm::Algorithm() +{ + SPDLOG_DEBUG("Kompute Algorithm base constructor"); +} + +Algorithm::Algorithm(std::shared_ptr device, std::shared_ptr commandBuffer) +{ + SPDLOG_DEBUG("Kompute Algorithm Constructor with device"); + + this->mDevice = device; + this->mCommandBuffer = commandBuffer; +} + +Algorithm::~Algorithm() +{ + SPDLOG_DEBUG("Kompute Algorithm Destructor started"); + + if (!this->mDevice) { + spdlog::error( + "Kompute Algorithm destructor reached with null Device pointer"); + return; + } +} + +void Algorithm::init(std::string shaderFilePath, + std::vector> tensorParams) { + SPDLOG_DEBUG("Kompute Algorithm init started"); + + spdlog::info("Loading shader with file path {}", shaderFilePath); + + // TODO: Move to util function + this->createParameters(tensorParams); + this->createShaderModule(shaderFilePath); + this->createPipeline(); +} + +void Algorithm::createParameters(std::vector>& tensorParams) { + std::vector descriptorPoolSizes; + + for (std::shared_ptr tensorParam : tensorParams) { + descriptorPoolSizes.push_back( + vk::DescriptorPoolSize( + vk::DescriptorType::eStorageBuffer, + 1 // Descriptor count + ) + ); + } + + // TODO: Explore design for having more than 1 set configurable + vk::DescriptorPoolCreateInfo descriptorPoolInfo( + vk::DescriptorPoolCreateFlags(), + 1, // Max sets + descriptorPoolSizes.size(), + descriptorPoolSizes.data()); + + this->mDescriptorPool = std::make_shared(); + this->mDevice->createDescriptorPool(&descriptorPoolInfo, nullptr, this->mDescriptorPool.get()); + + // TODO: Explore allowing descriptor set bind index + std::vector descriptorSetBindings; + for (size_t i = 0; i < tensorParams.size(); i++) { + descriptorSetBindings.push_back( + vk::DescriptorSetLayoutBinding( + i, // Binding index + vk::DescriptorType::eStorageBuffer, + 1, // Descriptor count + vk::ShaderStageFlagBits::eCompute) + ); + } + + // This is the component that is fed into the pipeline + vk::DescriptorSetLayoutCreateInfo descriptorSetLayoutInfo( + vk::DescriptorSetLayoutCreateFlags(), + descriptorSetBindings.size(), + descriptorSetBindings.data() + ); + + // TODO: We createa signle descriptor set layout which would have to be extended if multiple set layouts to be supported + this->mDescriptorSetLayout = std::make_shared(); + this->mDevice->createDescriptorSetLayout(&descriptorSetLayoutInfo, nullptr, this->mDescriptorSetLayout.get()); + + vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo( + *this->mDescriptorPool, + 1, // Descriptor set layout count + this->mDescriptorSetLayout.get()); + + std::vector descriptorSets = + this->mDevice->allocateDescriptorSets(descriptorSetAllocateInfo); + + if (descriptorSets.size() != tensorParams.size()) { + throw std::runtime_error("Number of descriptor sets does not match number of paramters"); + } + + std::vector computeWriteDescriptorSets; + for (size_t i = 0; i < descriptorSets.size(); i++) { + + std::shared_ptr currTensor = tensorParams[i]; + vk::DescriptorSet& currDescriptorSet = descriptorSets[i]; + this->mDescriptorSets.push_back(std::make_shared(currDescriptorSet)); + + vk::DescriptorBufferInfo descriptorBufferInfo = currTensor->constructDescriptorBufferInfo(); + + computeWriteDescriptorSets.push_back( + vk::WriteDescriptorSet()); + } + + this->mDevice->updateDescriptorSets(computeWriteDescriptorSets, nullptr); +} + +void Algorithm::createShaderModule(std::string shaderFilePath) { + std::ifstream fileStream( + shaderFilePath, std::ios::binary | std::ios::in | std::ios::ate); + + size_t shaderFileSize = fileStream.tellg(); + fileStream.seekg(0, std::ios::beg); + char* shaderFileData = new char[shaderFileSize]; + fileStream.read(shaderFileData, shaderFileSize); + fileStream.close(); + + vk::ShaderModuleCreateInfo shaderModuleInfo(vk::ShaderModuleCreateFlags(), shaderFileSize, (uint32_t*)shaderFileData); + + this->mFreeShaderModule = true; + this->mShaderModule = std::shared_ptr(); + this->mDevice->createShaderModule(&shaderModuleInfo, nullptr, this->mShaderModule.get()); +} + +void Algorithm::createPipeline() { + SPDLOG_DEBUG("Kompute Algorithm calling create Pipeline"); + + vk::PipelineLayoutCreateInfo pipelineLayoutInfo( + vk::PipelineLayoutCreateFlags(), + 1, // Set layout count + this->mDescriptorSetLayout.get()); + + this->mPipelineLayout = std::make_shared(); + this->mDevice->createPipelineLayout(&pipelineLayoutInfo, nullptr, this->mPipelineLayout.get()); + + vk::PipelineShaderStageCreateInfo shaderStage(vk::PipelineShaderStageCreateFlags(), vk::ShaderStageFlagBits::eCompute, *this->mShaderModule, "main", nullptr); + + vk::ComputePipelineCreateInfo pipelineInfo(vk::PipelineCreateFlags(), shaderStage, *this->mPipelineLayout, vk::Pipeline(), 0); + + // TODO: Confirm what the best structure is with pipeline cache + this->mFreePipelineCache = true; + this->mPipelineCache = std::make_shared(vk::PipelineCacheCreateInfo()); + + vk::ResultValue pipelineResult = this->mDevice->createComputePipeline(*this->mPipelineCache, pipelineInfo); + + if (pipelineResult.result != vk::Result::eSuccess) { + throw std::runtime_error("Failed to create pipeline result: " + vk::to_string(pipelineResult.result)); + } + + this->mFreePipeline = true; + this->mPipeline = std::make_shared(pipelineResult.value); +} + +void Algorithm::recordDispatch(uint32_t x, uint32_t y, uint32_t z) { + SPDLOG_DEBUG("Kompute Algorithm calling record dispatch"); + + this->mCommandBuffer->bindPipeline(vk::PipelineBindPoint::eCompute, *this->mPipeline); + + // TODO: Simplify interaction given we store array of pointers + std::vector descriptorSetRefs(this->mDescriptorSets.size()); + for (size_t i = 0; i < this->mDescriptorSets.size(); i++) { + descriptorSetRefs[i] = this->mDescriptorSets[i]; + } + + this->mCommandBuffer->bindDescriptorSets(vk::PipelineBindPoint::eCompute, *this->mPipelineLayout, 0, descriptorSetRefs, nullptr); + + this->mCommandBuffer->dispatch(x, y, z); +} + +} diff --git a/src/Algorithm.hpp b/src/Algorithm.hpp index 878d4df..7620ece 100644 --- a/src/Algorithm.hpp +++ b/src/Algorithm.hpp @@ -19,13 +19,43 @@ class Algorithm public: Algorithm(); - Algorithm(std::shared_ptr device); + Algorithm(std::shared_ptr device, std::shared_ptr commandBuffer); // TODO: Add specialisation data + // TODO: Explore other ways of passing shader (ie raw bytes) void init(std::string shaderFilePath, std::vector> tensorParams); ~Algorithm(); + + // Record commands + void recordDispatch(uint32_t x, uint32_t y, uint32_t z); + +private: + // Shared resources + std::shared_ptr mDevice; + std::shared_ptr mCommandBuffer; + + // Resources owned by default + std::shared_ptr mDescriptorSetLayout; + bool mFreeDescriptorSetLayout = false; + std::shared_ptr mDescriptorPool; + bool mFreeDescriptorPool = false; + std::vector> mDescriptorSets; + bool mFreeDescriptorSet = false; + std::shared_ptr mShaderModule; + bool mFreeShaderModule = false; + std::shared_ptr mPipelineLayout; + bool mFreePipelineLayout = false; + std::shared_ptr mPipelineCache; + bool mFreePipelineCache = false; + std::shared_ptr mPipeline; + bool mFreePipeline = false; + + // Create util functions + void createParameters(std::vector>& tensorParams); + void createShaderModule(std::string shaderFilePath); + void createPipeline(); }; } // End namespace kp diff --git a/src/OpBase.hpp b/src/OpBase.hpp index c2f6493..a4ce407 100644 --- a/src/OpBase.hpp +++ b/src/OpBase.hpp @@ -42,6 +42,12 @@ class OpBase virtual void record() { SPDLOG_DEBUG("Kompute OpBase record called"); } + virtual void postSubmit() + { + SPDLOG_DEBUG("Kompute OpBase init called"); + } + + protected: std::shared_ptr mPhysicalDevice; std::shared_ptr mDevice; diff --git a/src/OpCreateTensor.cpp b/src/OpCreateTensor.cpp index 8284b3b..cc82b9d 100644 --- a/src/OpCreateTensor.cpp +++ b/src/OpCreateTensor.cpp @@ -65,4 +65,10 @@ OpCreateTensor::record() } } +void OpCreateTensor::postSubmit() +{ + SPDLOG_DEBUG("Kompute OpCreateTensor postSubmit called"); + +} + } diff --git a/src/OpCreateTensor.hpp b/src/OpCreateTensor.hpp index b5fb16d..3ca15e6 100644 --- a/src/OpCreateTensor.hpp +++ b/src/OpCreateTensor.hpp @@ -31,6 +31,8 @@ class OpCreateTensor : public OpBase void record() override; + void postSubmit() override; + private: std::shared_ptr mPrimaryTensor; std::shared_ptr mStagingTensor; diff --git a/src/OpMult.cpp b/src/OpMult.cpp index 6d4c49a..89217e8 100644 --- a/src/OpMult.cpp +++ b/src/OpMult.cpp @@ -11,12 +11,15 @@ OpMult::OpMult() SPDLOG_DEBUG("Kompute OpMult constructor base"); } +// TODO: Remove physicalDevice from main initialiser OpMult::OpMult(std::shared_ptr physicalDevice, std::shared_ptr device, std::shared_ptr commandBuffer) : OpBase(physicalDevice, device, commandBuffer) { SPDLOG_DEBUG("Kompute OpMult constructor with params"); + + this->mAlgorithm = Algorithm(device, commandBuffer); } OpMult::~OpMult() @@ -29,18 +32,40 @@ OpMult::init(std::vector> tensors) { SPDLOG_DEBUG("Kompute OpMult init called"); - if (tensors.size() < 2) { + if (tensors.size() < 3) { throw std::runtime_error( "Kompute OpMult called with less than 1 tensor"); - } else if (tensors.size() > 2) { + } else if (tensors.size() > 3) { spdlog::warn("Kompute OpMult called with more than 2 tensor"); } + + this->mTensorLHS = tensors[0]; + this->mTensorRHS = tensors[1]; + this->mTensorOutput = tensors[2]; + + this->mTensorOutputStaging= std::make_shared( + this->mTensorOutput->data(), Tensor::TensorTypes::eStaging); + + this->mAlgorithm.init( + "shaders/glsl/computeheadless.comp.spv", tensors); } void OpMult::record() { SPDLOG_DEBUG("Kompute OpMult record called"); + + this->mAlgorithm.recordDispatch(1, 1, 1); + + this->mTensorOutputStaging->recordCopyFrom(this->mTensorOutput); +} + +void OpMult::postSubmit() +{ + SPDLOG_DEBUG("Kompute OpCreateTensor postSubmit called"); + + this->mTensorOutputStaging->copyDataFromHostBuffer(); + this->mTensorOutput->setData(this->mTensorOutputStaging->data()); } } diff --git a/src/OpMult.hpp b/src/OpMult.hpp index 16a9bae..364029f 100644 --- a/src/OpMult.hpp +++ b/src/OpMult.hpp @@ -11,6 +11,7 @@ #include #include "Tensor.hpp" +#include "Algorithm.hpp" #include "OpBase.hpp" @@ -31,9 +32,14 @@ class OpMult : public OpBase void record() override; + void postSubmit() override; + private: - std::shared_ptr mPrimaryTensor; - std::shared_ptr mStagingTensor; + Algorithm mAlgorithm; + std::shared_ptr mTensorLHS; + std::shared_ptr mTensorRHS; + std::shared_ptr mTensorOutput; + std::shared_ptr mTensorOutputStaging; }; } // End namespace kp diff --git a/src/Parameter.hpp b/src/Parameter.hpp index a01a44a..739c638 100644 --- a/src/Parameter.hpp +++ b/src/Parameter.hpp @@ -1,13 +1,57 @@ #pragma once +#include +#include + +// SPDLOG_ACTIVE_LEVEL must be defined before spdlog.h import +#if DEBUG +#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG +#endif + +#include + +#include "Tensor.hpp" + namespace kp { -class Parameter +class Algorithm { - private: public: - Parameter(); - virtual ~Parameter(); + Algorithm(); + + Algorithm(std::shared_ptr device); + + // TODO: Add specialisation data + // TODO: Explore other ways of passing shader (ie raw bytes) + void init(std::string shaderFilePath, + std::vector> tensorParams); + + ~Algorithm(); + +private: + // Shared resources + std::shared_ptr mDevice; + + // Resources owned by default + std::shared_ptr mDescriptorSetLayout; + bool mFreeDescriptorSetLayout = false; + std::shared_ptr mDescriptorPool; + bool mFreeDescriptorPool = false; + std::shared_ptr mDescriptorSet; + bool mFreeDescriptorSet = false; + std::shared_ptr mShaderModule; + bool mFreeShaderModule = false; + std::shared_ptr mPipelineLayout; + bool mFreePipelineLayout = false; + std::shared_ptr mPipelineCache; + bool mFreePipelineCache = false; + std::shared_ptr mPipeline; + bool mFreePipeline = false; + + // Create util functions + void createParameters(); + void createShaderModule(std::string shaderFilePath); + void createPipeline(); }; } // End namespace kp diff --git a/src/Tensor.cpp b/src/Tensor.cpp index 7efcdd2..d98105d 100644 --- a/src/Tensor.cpp +++ b/src/Tensor.cpp @@ -105,12 +105,16 @@ Tensor::isInit() return this->mIsInit; } +void Tensor::setData(const std::vector& data) { + this->mData = data; +} + void Tensor::recordCopyFrom(std::shared_ptr copyFromTensor) { SPDLOG_DEBUG("Kompute Tensor recordCopyFrom called"); - if (!this->mIsInit) { + if (!this->mIsInit || !copyFromTensor->mIsInit) { throw std::runtime_error( "Kompute Tensor attempted to run createBuffer without init"); } @@ -126,9 +130,53 @@ Tensor::recordCopyFrom(std::shared_ptr copyFromTensor) this->mCommandBuffer->copyBuffer( *copyFromTensor->mBuffer, *this->mBuffer, copyRegion); + // TODO: Ensure copied data is consistent with device this->mData = copyFromTensor->mData; } +// TODO: Explore if this function should be here or expose buffer +vk::DescriptorBufferInfo Tensor::constructDescriptorBufferInfo() { + return vk::DescriptorBufferInfo( + *this->mBuffer, + 0, // offset + this->memorySize() + ); +} + +void Tensor::copyDataFromHostBuffer() { + SPDLOG_DEBUG("Kompute Tensor copying data from host buffer"); + + if (this->mTensorType != TensorTypes::eStaging) { + spdlog::warn("Copying tensor data manually to DEVICE buffer instead of using record GPU command"); + } + + vk::DeviceSize bufferSize = this->memorySize(); + void* mapped = this->mDevice->mapMemory(*this->mMemory, 0, bufferSize, vk::MemoryMapFlags()); + vk::MappedMemoryRange mappedMemoryRange(*this->mMemory, 0, bufferSize); + this->mDevice->invalidateMappedMemoryRanges(mappedMemoryRange); + memcpy(this->mData.data(), mapped, bufferSize); + this->mDevice->unmapMemory(*this->mMemory); +} + +void Tensor::copyDataToHostBuffer() { + + SPDLOG_DEBUG("Kompute Tensor copying data to buffer"); + + if (this->mTensorType != TensorTypes::eStaging) { + spdlog::warn("Copying tensor data manually to DEVICE buffer instead of using record GPU command"); + } + + vk::DeviceSize bufferSize = this->memorySize(); + + // TODO: Verify if flushed memory ranges should happend in sequence + void* mapped = this->mDevice->mapMemory( + *this->mMemory, 0, bufferSize, vk::MemoryMapFlags()); + memcpy(mapped, this->mData.data(), bufferSize); + vk::MappedMemoryRange mappedRange(*this->mMemory, 0, bufferSize); + this->mDevice->flushMappedMemoryRanges(1, &mappedRange); + this->mDevice->unmapMemory(*this->mMemory); +} + vk::BufferUsageFlags Tensor::getBufferUsageFlags() { @@ -249,17 +297,7 @@ Tensor::createBuffer(void* data) SPDLOG_DEBUG("Kompute Tensor buffer & memory creation successful"); if (data != nullptr) { - SPDLOG_DEBUG("Kompute Tensor mapping data to buffer"); - - // TODO: Verify if flushed memory ranges should happend in sequence - void* mapped = this->mDevice->mapMemory( - *this->mMemory, 0, bufferSize, vk::MemoryMapFlags()); - memcpy(mapped, data, bufferSize); - vk::MappedMemoryRange mappedRange(*this->mMemory, 0, bufferSize); - this->mDevice->flushMappedMemoryRanges(1, &mappedRange); - this->mDevice->unmapMemory(*this->mMemory); - - SPDLOG_DEBUG("Kompute Tensor successful copy data to tensor"); + this->copyDataToHostBuffer(); } } diff --git a/src/Tensor.hpp b/src/Tensor.hpp index 611b3c4..9880a80 100644 --- a/src/Tensor.hpp +++ b/src/Tensor.hpp @@ -46,8 +46,18 @@ class Tensor TensorTypes tensorType(); bool isInit(); + // Setters + void setData(const std::vector& data); + // Record functions void recordCopyFrom(std::shared_ptr copyFromTensor); + // TODO: Add memory buffer barrier capabilities + //void recordBufferMemoryBarrier(); + + // Util functions + vk::DescriptorBufferInfo constructDescriptorBufferInfo(); + void copyDataFromHostBuffer(); + void copyDataToHostBuffer(); private: std::shared_ptr mPhysicalDevice; diff --git a/src/main.cpp b/src/main.cpp index 5cfbf11..89616f8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -22,6 +22,7 @@ #include "Manager.hpp" #include "OpCreateTensor.hpp" +#include "OpMult.hpp" #include "Tensor.hpp" #define BUFFER_ELEMENTS 32 @@ -493,6 +494,7 @@ class VulkanCompute nullptr, bufferMemoryBarrier, nullptr); + this->mCommandBuffer.bindPipeline( vk::PipelineBindPoint::eCompute, this->mPipeline); this->mCommandBuffer.bindDescriptorSets( @@ -623,18 +625,28 @@ main() kp::Manager mgr; spdlog::info("Creating first tensor"); - std::shared_ptr tensorOne{ new kp::Tensor( + std::shared_ptr tensorLHS{ new kp::Tensor( { 0.0, 1.0, 2.0 }) }; - mgr.evalOp({ tensorOne }); + mgr.evalOp({ tensorLHS }); spdlog::info("Creating second tensor"); - std::shared_ptr tensorTwo{ new kp::Tensor( - { 0.0, 1.0, 2.0 }) }; - mgr.evalOp({ tensorTwo }); + std::shared_ptr tensorRHS{ new kp::Tensor( + { 2.0, 4.0, 6.0 }) }; + mgr.evalOp({ tensorRHS }); + + // TODO: Add capabilities for just output tensor types + spdlog::info("Creating output tensor"); + std::shared_ptr tensorOutput{ new kp::Tensor( + { 0.0, 0.0, 0.0 }) }; + mgr.evalOp({ tensorOutput }); spdlog::info("Called manager eval success"); - spdlog::info("Tensor one: {}", tensorOne->data()); - spdlog::info("Tensor two: {}", tensorTwo->data()); + spdlog::info("Tensor one: {}", tensorLHS->data()); + spdlog::info("Tensor two: {}", tensorRHS->data()); + spdlog::info("Tensor two: {}", tensorOutput->data()); + + spdlog::info("Calling op mult"); + mgr.evalOp({ tensorLHS, tensorRHS, tensorOutput }); return 0; } catch (const std::exception& exc) {