mirror of
https://github.com/nomic-ai/kompute.git
synced 2026-05-11 00:49:58 +00:00
Updated documentation examples
This commit is contained in:
@@ -78,7 +78,7 @@ int main() {
|
||||
// 3. Run operation with string shader synchronously
|
||||
mgr.evalOpDefault<kp::OpAlgoBase>(
|
||||
{ tensorInA, tensorInB, tensorOut },
|
||||
std::vector<uint32_t>(shaderString.begin(), shaderString.end()));
|
||||
kp::Shader::compile_source(shaderString));
|
||||
|
||||
// 4. Map results back from GPU memory to print the results
|
||||
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorInA, tensorInB, tensorOut });
|
||||
|
||||
@@ -45,7 +45,7 @@ Pass compute shader data in glsl/hlsl text or compiled SPIR-V format (or as path
|
||||
auto tensorB = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 0., 0. }));
|
||||
|
||||
// Create tensors data explicitly in GPU with an operation
|
||||
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
|
||||
mgr.rebuild({ tensorA, tensorB });
|
||||
|
||||
// Define your shader as a string (using string literals for simplicity)
|
||||
// (You can also pass the raw compiled bytes, or even path to file)
|
||||
@@ -67,7 +67,7 @@ Pass compute shader data in glsl/hlsl text or compiled SPIR-V format (or as path
|
||||
// Run Kompute operation on the parameters provided with dispatch layout
|
||||
mgr.evalOpDefault<kp::OpAlgoBase>(
|
||||
{ tensorA, tensorB },
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
kp::Shader::compile_source(shader));
|
||||
|
||||
// Sync the GPU memory back to the local tensor
|
||||
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
|
||||
@@ -105,7 +105,7 @@ Record commands in a single submit by using a Sequence to send in batch to GPU.
|
||||
sq->begin();
|
||||
|
||||
// Record batch commands to send to GPU
|
||||
sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
sq->record<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });
|
||||
sq->record<kp::OpTensorCopy>({tensorOutput, tensorLHS, tensorRHS});
|
||||
|
||||
// Stop recording
|
||||
@@ -146,7 +146,7 @@ You can submit operations asynchronously with the async/await commands in the kp
|
||||
auto tensor = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));
|
||||
|
||||
// Create tensors data explicitly in GPU with an operation
|
||||
mgr.evalOpAsyncDefault<kp::OpTensorCreate>({ tensor });
|
||||
mgr.rebuild(tensor)
|
||||
|
||||
// Define your shader as a string (using string literals for simplicity)
|
||||
// (You can also pass the raw compiled bytes, or even path to file)
|
||||
@@ -174,6 +174,8 @@ You can submit operations asynchronously with the async/await commands in the kp
|
||||
}
|
||||
)");
|
||||
|
||||
std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
|
||||
|
||||
// We can now await for the previous submitted command
|
||||
// The first parameter can be the amount of time to wait
|
||||
// The time provided is in nanoseconds
|
||||
@@ -182,7 +184,7 @@ You can submit operations asynchronously with the async/await commands in the kp
|
||||
// Run Async Kompute operation on the parameters provided
|
||||
mgr.evalOpAsyncDefault<kp::OpAlgoBase>(
|
||||
{ tensor },
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
spirv);
|
||||
|
||||
// Here we can do other work
|
||||
|
||||
@@ -234,7 +236,7 @@ Back to `examples list <#simple-examples>`_.
|
||||
auto tensorB = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));
|
||||
|
||||
// We run the first step synchronously on the default sequence
|
||||
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
|
||||
mgr.rebuild({ tensorA, tensorB });
|
||||
|
||||
// Define your shader as a string (using string literals for simplicity)
|
||||
// (You can also pass the raw compiled bytes, or even path to file)
|
||||
@@ -262,17 +264,19 @@ Back to `examples list <#simple-examples>`_.
|
||||
}
|
||||
)");
|
||||
|
||||
std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
|
||||
|
||||
// Run the first parallel operation in the `queueOne` sequence
|
||||
mgr.evalOpAsync<kp::OpAlgoBase>(
|
||||
{ tensorA },
|
||||
"queueOne",
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
spirv);
|
||||
|
||||
// Run the second parallel operation in the `queueTwo` sequence
|
||||
mgr.evalOpAsync<kp::OpAlgoBase>(
|
||||
{ tensorB },
|
||||
"queueTwo",
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
spirv);
|
||||
|
||||
// Here we can do other work
|
||||
|
||||
@@ -308,7 +312,7 @@ We also provide tools that allow you to `convert shaders into C++ headers <https
|
||||
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "")
|
||||
{
|
||||
// Perform your custom steps such as reading from a shader file
|
||||
this->mShaderFilePath = "shaders/glsl/opmult.comp";
|
||||
this->mShaderFilePath = "shaders/glsl/opmult.comp.spv";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -323,7 +327,7 @@ We also provide tools that allow you to `convert shaders into C++ headers <https
|
||||
auto tensorOut = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 0., 0. }));
|
||||
|
||||
// Create tensors data explicitly in GPU with an operation
|
||||
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorLhs, tensorRhs, tensorOut });
|
||||
mgr.rebuild({ tensorLhs, tensorRhs, tensorOut });
|
||||
|
||||
// Run Kompute operation on the parameters provided with dispatch layout
|
||||
mgr.evalOpDefault<kp::OpMyCustom<3, 1, 1>>(
|
||||
@@ -334,258 +338,3 @@ We also provide tools that allow you to `convert shaders into C++ headers <https
|
||||
}
|
||||
|
||||
|
||||
Logistic Regression Example
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Logistic regression is often seen as the hello world in machine learning so we will be using it for our examples. Back to `examples list <#simple-examples>`_.
|
||||
|
||||
.. image:: ../images/logistic-regression.jpg
|
||||
:width: 300px
|
||||
|
||||
|
||||
In summary, we have:
|
||||
|
||||
|
||||
* Vector ``X`` with input data (with a pair of inputs ``Xi`` and ``Xj``\ )
|
||||
* Output ``Y`` with expected predictions
|
||||
|
||||
With this we will:
|
||||
|
||||
* Optimize the function simplified as ``Y = WX + b``
|
||||
* We'll want our program to learn the parameters ``W`` and ``b``
|
||||
|
||||
We will have to convert this into Kompute terminology.
|
||||
|
||||
First specifically around the inputs, we will be using the following:
|
||||
|
||||
* Two vectors for the variable `X`, vector `Xi` and `Xj`
|
||||
* One vector `Y` for the true predictions
|
||||
* A vector `W` containing the two input weight values to use for inference
|
||||
* A vector `B` containing a single input parameter for `b`
|
||||
|
||||
.. code-block:: cpp
|
||||
:linenos:
|
||||
|
||||
std::vector<float> wInVec = { 0.001, 0.001 };
|
||||
std::vector<float> bInVec = { 0 };
|
||||
|
||||
std::shared_ptr<kp::Tensor> xI{ new kp::Tensor({ 0, 1, 1, 1, 1 })};
|
||||
std::shared_ptr<kp::Tensor> xJ{ new kp::Tensor({ 0, 0, 0, 1, 1 })};
|
||||
|
||||
std::shared_ptr<kp::Tensor> y{ new kp::Tensor({ 0, 0, 0, 1, 1 })};
|
||||
|
||||
std::shared_ptr<kp::Tensor> wIn{
|
||||
new kp::Tensor(wInVec, kp::Tensor::TensorTypes::eStaging)};
|
||||
|
||||
std::shared_ptr<kp::Tensor> bIn{
|
||||
new kp::Tensor(bInVec, kp::Tensor::TensorTypes::eStaging)};
|
||||
|
||||
|
||||
We will have the following output vectors:
|
||||
|
||||
* Two output vectors `Wi` and `Wj` to store all the deltas to perform gradient descent on W
|
||||
* One output vector `Bout` to store all the deltas to perform gradient descent on B
|
||||
|
||||
.. code-block:: cpp
|
||||
:linenos:
|
||||
|
||||
std::shared_ptr<kp::Tensor> wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
|
||||
std::shared_ptr<kp::Tensor> wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
|
||||
|
||||
std::shared_ptr<kp::Tensor> bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
|
||||
|
||||
|
||||
For simplicity we will store all the tensors inside a params variable:
|
||||
|
||||
.. code-block:: cpp
|
||||
:linenos:
|
||||
|
||||
std::vector<std::shared_ptr<kp::Tensor>> params =
|
||||
{xI, xJ, y, wIn, wOutI, wOutJ, bIn, bOut};
|
||||
|
||||
|
||||
Now that we have the inputs and outputs we will be able to use them in the processing. The workflow we will be using is the following:
|
||||
|
||||
1. Create a Sequence to record and submit GPU commands
|
||||
2. Submit OpCreateTensor to create all the tensors
|
||||
3. Record the OpAlgo with the Logistic Regression shader
|
||||
4. Loop across number of iterations:
|
||||
4-a. Submit algo operation on LR shader
|
||||
4-b. Re-calculate weights from loss
|
||||
5. Print output weights and bias
|
||||
|
||||
1. Create a sequence to record and submit GPU commands
|
||||
|
||||
.. code-block:: cpp
|
||||
:linenos:
|
||||
|
||||
kp::Manager mgr;
|
||||
|
||||
if (std::shared_ptr<kp::Sequence> sq =
|
||||
mgr.sequence("createTensors").lock())
|
||||
{
|
||||
// ...
|
||||
|
||||
|
||||
|
||||
Submit OpCreateTensor to create all the tensors
|
||||
|
||||
.. code-block:: cpp
|
||||
:linenos:
|
||||
|
||||
{
|
||||
// ... continuing from codeblock above
|
||||
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpCreateTensor>(params);
|
||||
|
||||
sq->end();
|
||||
sq->eval();
|
||||
|
||||
|
||||
Record the OpAlgo with the Logistic Regression shader
|
||||
|
||||
Once we re-record, all the instructions that were recorded previously are cleared.
|
||||
|
||||
Because of this we can record now the new commands which will consist of the following:
|
||||
|
||||
|
||||
.. code-block:: cpp
|
||||
:linenos:
|
||||
|
||||
{
|
||||
// ... continuing from codeblock above
|
||||
|
||||
sq->begin();
|
||||
|
||||
sq->record<kp::OpTensorSyncDevice>({wIn, bIn});
|
||||
|
||||
sq->record<kp::OpAlgoBase>(
|
||||
params,
|
||||
false, // Whether to copy output from device
|
||||
"test/shaders/glsl/test_logistic_regression.comp");
|
||||
|
||||
sq->record<kp::OpTensorSyncLocal>({wOutI, wOutJ, bOut});
|
||||
|
||||
sq->end();
|
||||
|
||||
|
||||
|
||||
Loop across number of iterations + 4-a. Submit algo operation on LR shader
|
||||
|
||||
.. code-block:: cpp
|
||||
:linenos:
|
||||
|
||||
{
|
||||
// ... continuing from codeblock above
|
||||
|
||||
uint32_t ITERATIONS = 100;
|
||||
|
||||
for (size_t i = 0; i < ITERATIONS; i++)
|
||||
{
|
||||
// Run evaluation which passes data through shader once
|
||||
sq->eval();
|
||||
|
||||
|
||||
|
||||
4-b. Re-calculate weights from loss
|
||||
|
||||
|
||||
Once the shader code is executed, we are able to use the outputs from the shader calculation.
|
||||
|
||||
In this case we want to basically add all the calculated weights and bias from the back-prop step.
|
||||
|
||||
.. code-block:: cpp
|
||||
:linenos:
|
||||
|
||||
{
|
||||
// ...
|
||||
for (size_t i = 0; i < ITERATIONS; i++)
|
||||
{
|
||||
// ... continuing from codeblock above
|
||||
|
||||
// Run evaluation which passes data through shader once
|
||||
sq->eval();
|
||||
|
||||
// Subtract the resulting weights and biases
|
||||
for(size_t j = 0; j < bOut->size(); j++) {
|
||||
wInVec[0] -= wOutI->data()[j];
|
||||
wInVec[1] -= wOutJ->data()[j];
|
||||
bInVec[0] -= bOut->data()[j];
|
||||
}
|
||||
// Set the data for the GPU to use in the next iteration
|
||||
wIn->mapDataIntoHostMemory();
|
||||
bIn->mapDataIntoHostMemory();
|
||||
}
|
||||
|
||||
5. Print output weights and bias
|
||||
|
||||
.. code-block:: cpp
|
||||
:linenos:
|
||||
|
||||
std::cout << "Weight i: " << wIn->data()[0] << std::endl;
|
||||
std::cout << "Weight j: " << wIn->data()[1] << std::endl;
|
||||
std::cout << "Bias: " << bIn->data()[0] << std::endl;
|
||||
|
||||
|
||||
|
||||
Logistic Regression Compute Shader
|
||||
----------------------------------
|
||||
|
||||
Finally you can see the shader used for the logistic regression usecase below:
|
||||
|
||||
.. code-block:: cpp
|
||||
:linenos:
|
||||
|
||||
#version 450
|
||||
|
||||
layout (constant_id = 0) const uint M = 0;
|
||||
|
||||
layout (local_size_x = 1) in;
|
||||
|
||||
layout(set = 0, binding = 0) buffer bxi { float xi[]; };
|
||||
layout(set = 0, binding = 1) buffer bxj { float xj[]; };
|
||||
layout(set = 0, binding = 2) buffer by { float y[]; };
|
||||
layout(set = 0, binding = 3) buffer bwin { float win[]; };
|
||||
layout(set = 0, binding = 4) buffer bwouti { float wouti[]; };
|
||||
layout(set = 0, binding = 5) buffer bwoutj { float woutj[]; };
|
||||
layout(set = 0, binding = 6) buffer bbin { float bin[]; };
|
||||
layout(set = 0, binding = 7) buffer bbout { float bout[]; };
|
||||
|
||||
float learningRate = 0.1;
|
||||
float m = float(M);
|
||||
|
||||
float sigmoid(float z) {
|
||||
return 1.0 / (1.0 + exp(-z));
|
||||
}
|
||||
|
||||
float inference(vec2 x, vec2 w, float b) {
|
||||
float z = dot(w, x) + b;
|
||||
float yHat = sigmoid(z);
|
||||
return yHat;
|
||||
}
|
||||
|
||||
float calculateLoss(float yHat, float y) {
|
||||
return -(y * log(yHat) + (1.0 - y) * log(1.0 - yHat));
|
||||
}
|
||||
|
||||
void main() {
|
||||
uint idx = gl_GlobalInvocationID.x;
|
||||
|
||||
vec2 wCurr = vec2(win[0], win[1]);
|
||||
float bCurr = bin[0];
|
||||
|
||||
vec2 xCurr = vec2(xi[idx], xj[idx]);
|
||||
float yCurr = y[idx];
|
||||
|
||||
float yHat = inference(xCurr, wCurr, bCurr);
|
||||
float loss = calculateLoss(yHat, yCurr);
|
||||
|
||||
float dZ = yHat - yCurr;
|
||||
vec2 dW = (1. / m) * xCurr * dZ;
|
||||
float dB = (1. / m) * dZ;
|
||||
wouti[idx] = learningRate * dW.x;
|
||||
woutj[idx] = learningRate * dW.y;
|
||||
bout[idx] = learningRate * dB;
|
||||
}
|
||||
|
||||
@@ -64,7 +64,7 @@ Sequences can be executed synchronously or asynchronously without having to c
|
||||
:linenos:
|
||||
|
||||
// Create tensors data explicitly in GPU with an operation
|
||||
mgr.evalOpAsyncDefault<kp::OpTensorCreate>({ tensor });
|
||||
mgr.rebuild({ tensor });
|
||||
|
||||
|
||||
While this is running we can actually do other things like in this case create the shader we'll be using.
|
||||
@@ -125,7 +125,7 @@ Similar to above we can run other commands such as the `OpAlgoBase` asynchronous
|
||||
// Run Async Kompute operation on the parameters provided
|
||||
mgr.evalOpAsyncDefault<kp::OpAlgoBase<>>(
|
||||
{ tensor },
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
kp::Shader::compile_source(shader));
|
||||
|
||||
// Here we can do other work
|
||||
|
||||
@@ -226,7 +226,7 @@ Similar to the asynchronous use case above, we can still run synchronous commands
|
||||
:linenos:
|
||||
|
||||
// We run the first step synchronously on the default sequence
|
||||
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
|
||||
mgr.rebuild({ tensorA, tensorB });
|
||||
|
||||
// Define your shader as a string (using string literals for simplicity)
|
||||
// (You can also pass the raw compiled bytes, or even path to file)
|
||||
@@ -259,17 +259,19 @@ Now we can actually trigger the parallel processing, running two OpAlgoBase Oper
|
||||
.. code-block:: cpp
|
||||
:linenos:
|
||||
|
||||
std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
|
||||
|
||||
// Run the first parallel operation in the `queueOne` sequence
|
||||
mgr.evalOpAsync<kp::OpAlgoBase<>>(
|
||||
{ tensorA },
|
||||
"queueOne",
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
spirv);
|
||||
|
||||
// Run the second parallel operation in the `queueTwo` sequence
|
||||
mgr.evalOpAsync<kp::OpAlgoBase<>>(
|
||||
{ tensorB },
|
||||
"queueTwo",
|
||||
std::vector<char>(shader.begin(), shader.end()));
|
||||
spirv);
|
||||
|
||||
|
||||
Similar to the asynchronous example above, we are able to do other work whilst the tasks are executing.
|
||||
|
||||
Reference in New Issue
Block a user