diff --git a/README.md b/README.md index e4a302a..8d8e139 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ int main() { // 3. Run operation with string shader synchronously mgr.evalOpDefault( { tensorInA, tensorInB, tensorOut }, - std::vector(shaderString.begin(), shaderString.end())); + kp::Shader::compile_source(shaderString)); // 4. Map results back from GPU memory to print the results mgr.evalOpDefault({ tensorInA, tensorInB, tensorOut }); diff --git a/docs/overview/advanced-examples.rst b/docs/overview/advanced-examples.rst index 665c4f0..bd9d550 100644 --- a/docs/overview/advanced-examples.rst +++ b/docs/overview/advanced-examples.rst @@ -45,7 +45,7 @@ Pass compute shader data in glsl/hlsl text or compiled SPIR-V format (or as path auto tensorB = std::make_shared(kp::Tensor({ 0., 0., 0. })); // Create tensors data explicitly in GPU with an operation - mgr.evalOpDefault({ tensorA, tensorB }); + mgr.rebuild({ tensorA, tensorB }); // Define your shader as a string (using string literals for simplicity) // (You can also pass the raw compiled bytes, or even path to file) @@ -67,7 +67,7 @@ Pass compute shader data in glsl/hlsl text or compiled SPIR-V format (or as path // Run Kompute operation on the parameters provided with dispatch layout mgr.evalOpDefault( { tensorA, tensorB }, - std::vector(shader.begin(), shader.end())); + kp::Shader::compile_source(shader)); // Sync the GPU memory back to the local tensor mgr.evalOpDefault({ tensorA, tensorB }); @@ -105,7 +105,7 @@ Record commands in a single submit by using a Sequence to send in batch to GPU. 
sq->begin(); // Record batch commands to send to GPU - sq->record>({ tensorLHS, tensorRHS, tensorOutput }); + sq->record({ tensorLHS, tensorRHS, tensorOutput }); sq->record({tensorOutput, tensorLHS, tensorRHS}); // Stop recording @@ -146,7 +146,7 @@ You can submit operations asynchronously with the async/await commands in the kp auto tensor = std::make_shared(kp::Tensor(std::vector(10, 0.0))); // Create tensors data explicitly in GPU with an operation - mgr.evalOpAsyncDefault({ tensor }); + mgr.rebuild({ tensor }); // Define your shader as a string (using string literals for simplicity) // (You can also pass the raw compiled bytes, or even path to file) @@ -174,6 +174,8 @@ You can submit operations asynchronously with the async/await commands in the kp } )"); + std::vector spirv = kp::Shader::compile_source(shader); + // We can now await for the previous submitted command // The first parameter can be the amount of time to wait // The time provided is in nanoseconds @@ -182,7 +184,7 @@ You can submit operations asynchronously with the async/await commands in the kp // Run Async Kompute operation on the parameters provided mgr.evalOpAsyncDefault( { tensor }, - std::vector(shader.begin(), shader.end())); + spirv); // Here we can do other work @@ -234,7 +236,7 @@ Back to `examples list <#simple-examples>`_. auto tensorB = std::make_shared(kp::Tensor(std::vector(10, 0.0))); // We run the first step synchronously on the default sequence - mgr.evalOpDefault({ tensorA, tensorB }); + mgr.rebuild({ tensorA, tensorB }); // Define your shader as a string (using string literals for simplicity) // (You can also pass the raw compiled bytes, or even path to file) @@ -262,17 +264,19 @@ Back to `examples list <#simple-examples>`_. 
} )"); + std::vector spirv = kp::Shader::compile_source(shader); + // Run the first parallel operation in the `queueOne` sequence mgr.evalOpAsync( { tensorA }, "queueOne", - std::vector(shader.begin(), shader.end())); + spirv); // Run the second parallel operation in the `queueTwo` sequence mgr.evalOpAsync( { tensorB }, "queueTwo", - std::vector(shader.begin(), shader.end())); + spirv); // Here we can do other work @@ -308,7 +312,7 @@ We also provide tools that allow you to `convert shaders into C++ headers mShaderFilePath = "shaders/glsl/opmult.comp"; + this->mShaderFilePath = "shaders/glsl/opmult.comp.spv"; } } @@ -323,7 +327,7 @@ We also provide tools that allow you to `convert shaders into C++ headers (kp::Tensor({ 0., 0., 0. })); // Create tensors data explicitly in GPU with an operation - mgr.evalOpDefault({ tensorLhs, tensorRhs, tensorOut }); + mgr.rebuild({ tensorLhs, tensorRhs, tensorOut }); // Run Kompute operation on the parameters provided with dispatch layout mgr.evalOpDefault>( @@ -334,258 +338,3 @@ We also provide tools that allow you to `convert shaders into C++ headers `_. - -.. image:: ../images/logistic-regression.jpg - :width: 300px - - -In summary, we have: - - -* Vector ``X`` with input data (with a pair of inputs ``Xi`` and ``Xj``\ ) -* Output ``Y`` with expected predictions - -With this we will: - -* Optimize the function simplified as ``Y = WX + b`` -* We'll want our program to learn the parameters ``W`` and ``b`` - -We will have to convert this into Kompute terminology. - -First specifically around the inputs, we will be using the following: - -* Two vertors for the variable `X`, vector `Xi` and `Xj` -* One vector `Y` for the true predictions -* A vector `W` containing the two input weight values to use for inference -* A vector `B` containing a single input parameter for `b` - -.. 
code-block:: cpp - :linenos: - - std::vector wInVec = { 0.001, 0.001 }; - std::vector bInVec = { 0 }; - - std::shared_ptr xI{ new kp::Tensor({ 0, 1, 1, 1, 1 })}; - std::shared_ptr xJ{ new kp::Tensor({ 0, 0, 0, 1, 1 })}; - - std::shared_ptr y{ new kp::Tensor({ 0, 0, 0, 1, 1 })}; - - std::shared_ptr wIn{ - new kp::Tensor(wInVec, kp::Tensor::TensorTypes::eStaging)}; - - std::shared_ptr bIn{ - new kp::Tensor(bInVec, kp::Tensor::TensorTypes::eStaging)}; - - -We will have the following output vectors: - -* Two output vectors `Wi` and `Wj` to store all the deltas to perform gradient descent on W -* One output vector `Bout` to store all the deltas to perform gradient descent on B - -.. code-block:: cpp - :linenos: - - std::shared_ptr wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 })}; - std::shared_ptr wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 })}; - - std::shared_ptr bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 })}; - - -For simplicity we will store all the tensors inside a params variable: - -.. code-block:: cpp - :linenos: - - std::vector> params = - {xI, xJ, y, wIn, wOutI, wOutJ, bIn, bOut}; - - -Now that we have the inputs and outputs we will be able to use them in the processing. The workflow we will be using is the following: - -1. Create a Sequence to record and submit GPU commands -2. Submit OpCreateTensor to create all the tensors -3. Record the OpAlgo with the Logistic Regression shader -4. Loop across number of iterations: - 4-a. Submit algo operation on LR shader - 4-b. Re-calculate weights from loss -5. Print output weights and bias - -1. Create a sequence to record and submit GPU commands - -.. code-block:: cpp - :linenos: - - kp::Manager mgr; - - if (std::shared_ptr sq = - mgr.sequence("createTensors").lock()) - { - // ... - - - -Submit OpCreateTensor to create all the tensors - -.. code-block:: cpp - :linenos: - - { - // ... 
continuing from codeblock above - - sq->begin(); - - sq->record(params); - - sq->end(); - sq->eval(); - - -Record the OpAlgo with the Logistic Regression shader - -Once we re-record, all the instructions that were recorded previously are cleared. - -Because of this we can record now the new commands which will consist of the following: - - -.. code-block:: cpp - :linenos: - - { - // ... continuing from codeblock above - - sq->begin(); - - sq->record({wIn, bIn}); - - sq->record( - params, - false, // Whether to copy output from device - "test/shaders/glsl/test_logistic_regression.comp"); - - sq->record({wOutI, wOutJ, bOut}); - - sq->end(); - - - -Loop across number of iterations + 4-a. Submit algo operation on LR shader - -.. code-block:: cpp - :linenos: - - { - // ... continuing from codeblock above - - uint32_t ITERATIONS = 100; - - for (size_t i = 0; i < ITERATIONS; i++) - { - // Run evaluation which passes data through shader once - sq->eval(); - - - -4-b. Re-calculate weights from loss - - -Once the shader code is executed, we are able to use the outputs from the shader calculation. - -In this case we want to basically add all the calculated weights and bias from the back-prop step. - -.. code-block:: cpp - :linenos: - - { - // ... - for (size_t i = 0; i < ITERATIONS; i++) - { - // ... continuing from codeblock above - - // Run evaluation which passes data through shader once - sq->eval(); - - // Subtract the resulting weights and biases - for(size_t j = 0; j < bOut->size(); j++) { - wInVec[0] -= wOutI->data()[j]; - wInVec[1] -= wOutJ->data()[j]; - bInVec[0] -= bOut->data()[j]; - } - // Set the data for the GPU to use in the next iteration - wIn->mapDataIntoHostMemory(); - bIn->mapDataIntoHostMemory(); - } - -5. Print output weights and bias - -.. 
code-block:: cpp - :linenos: - - std::cout << "Weight i: " << wIn->data()[0] << std::endl; - std::cout << "Weight j: " << wIn->data()[1] << std::endl; - std::cout << "Bias: " << bIn->data()[0] << std::endl; - - - -Logistic Regression Compute Shader ----------------------------------- - -Finally you can see the shader used for the logistic regression usecase below: - -.. code-block:: cpp - :linenos: - - #version 450 - - layout (constant_id = 0) const uint M = 0; - - layout (local_size_x = 1) in; - - layout(set = 0, binding = 0) buffer bxi { float xi[]; }; - layout(set = 0, binding = 1) buffer bxj { float xj[]; }; - layout(set = 0, binding = 2) buffer by { float y[]; }; - layout(set = 0, binding = 3) buffer bwin { float win[]; }; - layout(set = 0, binding = 4) buffer bwouti { float wouti[]; }; - layout(set = 0, binding = 5) buffer bwoutj { float woutj[]; }; - layout(set = 0, binding = 6) buffer bbin { float bin[]; }; - layout(set = 0, binding = 7) buffer bbout { float bout[]; }; - - float learningRate = 0.1; - float m = float(M); - - float sigmoid(float z) { - return 1.0 / (1.0 + exp(-z)); - } - - float inference(vec2 x, vec2 w, float b) { - float z = dot(w, x) + b; - float yHat = sigmoid(z); - return yHat; - } - - float calculateLoss(float yHat, float y) { - return -(y * log(yHat) + (1.0 - y) * log(1.0 - yHat)); - } - - void main() { - uint idx = gl_GlobalInvocationID.x; - - vec2 wCurr = vec2(win[0], win[1]); - float bCurr = bin[0]; - - vec2 xCurr = vec2(xi[idx], xj[idx]); - float yCurr = y[idx]; - - float yHat = inference(xCurr, wCurr, bCurr); - float loss = calculateLoss(yHat, yCurr); - - float dZ = yHat - yCurr; - vec2 dW = (1. / m) * xCurr * dZ; - float dB = (1. 
/ m) * dZ; - wouti[idx] = learningRate * dW.x; - woutj[idx] = learningRate * dW.y; - bout[idx] = learningRate * dB; - } diff --git a/docs/overview/async-parallel.rst b/docs/overview/async-parallel.rst index 1e0178b..0a31ef1 100644 --- a/docs/overview/async-parallel.rst +++ b/docs/overview/async-parallel.rst @@ -64,7 +64,7 @@ Sequences can be executed in synchronously or asynchronously without having to c :linenos: // Create tensors data explicitly in GPU with an operation - mgr.evalOpAsyncDefault({ tensor }); + mgr.rebuild({ tensor }); While this is running we can actually do other things like in this case create the shader we'll be using. @@ -125,7 +125,7 @@ Similar to above we can run other commands such as the `OpAlgoBase` asynchronous // Run Async Kompute operation on the parameters provided mgr.evalOpAsyncDefault>( { tensor }, - std::vector(shader.begin(), shader.end())); + kp::Shader::compile_source(shader)); // Here we can do other work @@ -226,7 +226,7 @@ Similar to the asyncrhonous usecase above, we can still run synchronous commands :linenos: // We run the first step synchronously on the default sequence - mgr.evalOpDefault({ tensorA, tensorB }); + mgr.rebuild({ tensorA, tensorB }); // Define your shader as a string (using string literals for simplicity) // (You can also pass the raw compiled bytes, or even path to file) @@ -259,17 +259,19 @@ Now we can actually trigger the parallel processing, running two OpAlgoBase Oper .. code-block:: cpp :linenos: + std::vector spirv = kp::Shader::compile_source(shader); + // Run the first parallel operation in the `queueOne` sequence mgr.evalOpAsync>( { tensorA }, "queueOne", - std::vector(shader.begin(), shader.end())); + spirv); // Run the second parallel operation in the `queueTwo` sequence mgr.evalOpAsync>( { tensorB }, "queueTwo", - std::vector(shader.begin(), shader.end())); + spirv); Similar to the asynchronous example above, we are able to do other work whilst the tasks are executing.