Updated documentation examples

Alejandro Saucedo
2021-02-21 12:12:16 +00:00
parent 29c50e5728
commit f474d21088
3 changed files with 22 additions and 271 deletions


@@ -78,7 +78,7 @@ int main() {
// 3. Run operation with string shader synchronously
mgr.evalOpDefault<kp::OpAlgoBase>(
{ tensorInA, tensorInB, tensorOut },
std::vector<uint32_t>(shaderString.begin(), shaderString.end()));
kp::Shader::compile_source(shaderString));
// 4. Map results back from GPU memory to print the results
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorInA, tensorInB, tensorOut });


@@ -45,7 +45,7 @@ Pass compute shader data in glsl/hlsl text or compiled SPIR-V format (or as path
auto tensorB = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 0., 0. }));
// Create tensors data explicitly in GPU with an operation
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
mgr.rebuild({ tensorA, tensorB });
// Define your shader as a string (using string literals for simplicity)
// (You can also pass the raw compiled bytes, or even path to file)
@@ -67,7 +67,7 @@ Pass compute shader data in glsl/hlsl text or compiled SPIR-V format (or as path
// Run Kompute operation on the parameters provided with dispatch layout
mgr.evalOpDefault<kp::OpAlgoBase>(
{ tensorA, tensorB },
std::vector<char>(shader.begin(), shader.end()));
kp::Shader::compile_source(shader));
// Sync the GPU memory back to the local tensor
mgr.evalOpDefault<kp::OpTensorSyncLocal>({ tensorA, tensorB });
@@ -105,7 +105,7 @@ Record commands in a single submit by using a Sequence to send in batch to GPU.
sq->begin();
// Record batch commands to send to GPU
sq->record<kp::OpMult<>>({ tensorLHS, tensorRHS, tensorOutput });
sq->record<kp::OpMult>({ tensorLHS, tensorRHS, tensorOutput });
sq->record<kp::OpTensorCopy>({tensorOutput, tensorLHS, tensorRHS});
// Stop recording
@@ -146,7 +146,7 @@ You can submit operations asynchronously with the async/await commands in the kp
auto tensor = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));
// Create tensors data explicitly in GPU with an operation
mgr.evalOpAsyncDefault<kp::OpTensorCreate>({ tensor });
mgr.rebuild({ tensor });
// Define your shader as a string (using string literals for simplicity)
// (You can also pass the raw compiled bytes, or even path to file)
@@ -174,6 +174,8 @@ You can submit operations asynchronously with the async/await commands in the kp
}
)");
std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
// We can now await the previously submitted command
// The first parameter can be the amount of time to wait
// The time provided is in nanoseconds
@@ -182,7 +184,7 @@ You can submit operations asynchronously with the async/await commands in the kp
// Run Async Kompute operation on the parameters provided
mgr.evalOpAsyncDefault<kp::OpAlgoBase>(
{ tensor },
std::vector<char>(shader.begin(), shader.end()));
spirv);
// Here we can do other work
@@ -234,7 +236,7 @@ Back to `examples list <#simple-examples>`_.
auto tensorB = std::make_shared<kp::Tensor>(kp::Tensor(std::vector<float>(10, 0.0)));
// We run the first step synchronously on the default sequence
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
mgr.rebuild({ tensorA, tensorB });
// Define your shader as a string (using string literals for simplicity)
// (You can also pass the raw compiled bytes, or even path to file)
@@ -262,17 +264,19 @@ Back to `examples list <#simple-examples>`_.
}
)");
std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
// Run the first parallel operation in the `queueOne` sequence
mgr.evalOpAsync<kp::OpAlgoBase>(
{ tensorA },
"queueOne",
std::vector<char>(shader.begin(), shader.end()));
spirv);
// Run the second parallel operation in the `queueTwo` sequence
mgr.evalOpAsync<kp::OpAlgoBase>(
{ tensorB },
"queueTwo",
std::vector<char>(shader.begin(), shader.end()));
spirv);
// Here we can do other work
@@ -308,7 +312,7 @@ We also provide tools that allow you to `convert shaders into C++ headers <https
: OpAlgoBase(physicalDevice, device, commandBuffer, tensors, "")
{
// Perform your custom steps such as reading from a shader file
this->mShaderFilePath = "shaders/glsl/opmult.comp";
this->mShaderFilePath = "shaders/glsl/opmult.comp.spv";
}
}
@@ -323,7 +327,7 @@ We also provide tools that allow you to `convert shaders into C++ headers <https
auto tensorOut = std::make_shared<kp::Tensor>(kp::Tensor({ 0., 0., 0. }));
// Create tensors data explicitly in GPU with an operation
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorLhs, tensorRhs, tensorOut });
mgr.rebuild({ tensorLhs, tensorRhs, tensorOut });
// Run Kompute operation on the parameters provided with dispatch layout
mgr.evalOpDefault<kp::OpMyCustom<3, 1, 1>>(
@@ -334,258 +338,3 @@ We also provide tools that allow you to `convert shaders into C++ headers <https
}
Logistic Regression Example
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Logistic regression is often seen as the hello world of machine learning, so we will be using it for our examples. Back to `examples list <#simple-examples>`_.
.. image:: ../images/logistic-regression.jpg
:width: 300px
In summary, we have:
* Vector ``X`` with input data (with a pair of inputs ``Xi`` and ``Xj``\ )
* Output ``Y`` with expected predictions
With this we will:
* Optimize the function simplified as ``Y = WX + b``
* Have our program learn the parameters ``W`` and ``b``
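Concretely, the model, the loss, and the gradients this example optimises (the same quantities the compute shader at the end of this section calculates) are:

.. math::

   \hat{y} = \sigma(w \cdot x + b), \qquad \sigma(z) = \frac{1}{1 + e^{-z}}

   L(\hat{y}, y) = -\left(y \log \hat{y} + (1 - y) \log(1 - \hat{y})\right)

   \frac{\partial L}{\partial w} = \frac{1}{m}\, x\, (\hat{y} - y), \qquad
   \frac{\partial L}{\partial b} = \frac{1}{m}\, (\hat{y} - y)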
We will have to convert this into Kompute terminology.
First, specifically around the inputs, we will be using the following:
* Two vectors for the variable `X`: vector `Xi` and vector `Xj`
* One vector `Y` for the true predictions
* A vector `W` containing the two input weight values to use for inference
* A vector `B` containing a single input parameter for `b`
.. code-block:: cpp
:linenos:
std::vector<float> wInVec = { 0.001, 0.001 };
std::vector<float> bInVec = { 0 };
std::shared_ptr<kp::Tensor> xI{ new kp::Tensor({ 0, 1, 1, 1, 1 })};
std::shared_ptr<kp::Tensor> xJ{ new kp::Tensor({ 0, 0, 0, 1, 1 })};
std::shared_ptr<kp::Tensor> y{ new kp::Tensor({ 0, 0, 0, 1, 1 })};
std::shared_ptr<kp::Tensor> wIn{
new kp::Tensor(wInVec, kp::Tensor::TensorTypes::eStaging)};
std::shared_ptr<kp::Tensor> bIn{
new kp::Tensor(bInVec, kp::Tensor::TensorTypes::eStaging)};
We will have the following output vectors:
* Two output vectors `Wi` and `Wj` to store all the deltas to perform gradient descent on W
* One output vector `Bout` to store all the deltas to perform gradient descent on B
.. code-block:: cpp
:linenos:
std::shared_ptr<kp::Tensor> wOutI{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
std::shared_ptr<kp::Tensor> wOutJ{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
std::shared_ptr<kp::Tensor> bOut{ new kp::Tensor({ 0, 0, 0, 0, 0 })};
For simplicity we will store all the tensors inside a params variable:
.. code-block:: cpp
:linenos:
std::vector<std::shared_ptr<kp::Tensor>> params =
{xI, xJ, y, wIn, wOutI, wOutJ, bIn, bOut};
Now that we have the inputs and outputs we will be able to use them in the processing. The workflow we will be using is the following:
1. Create a Sequence to record and submit GPU commands
2. Submit OpCreateTensor to create all the tensors
3. Record the OpAlgo with the Logistic Regression shader
4. Loop across number of iterations:
4-a. Submit algo operation on LR shader
4-b. Re-calculate weights from loss
5. Print output weights and bias
1. Create a sequence to record and submit GPU commands
.. code-block:: cpp
:linenos:
kp::Manager mgr;
if (std::shared_ptr<kp::Sequence> sq =
mgr.sequence("createTensors").lock())
{
// ...
2. Submit OpCreateTensor to create all the tensors
.. code-block:: cpp
:linenos:
{
// ... continuing from codeblock above
sq->begin();
sq->record<kp::OpCreateTensor>(params);
sq->end();
sq->eval();
3. Record the OpAlgo with the Logistic Regression shader
Once we re-record, all the instructions that were recorded previously are cleared.
Because of this, we can now record the new commands, which will consist of the following:
.. code-block:: cpp
:linenos:
{
// ... continuing from codeblock above
sq->begin();
sq->record<kp::OpTensorSyncDevice>({wIn, bIn});
sq->record<kp::OpAlgoBase>(
params,
false, // Whether to copy output from device
"test/shaders/glsl/test_logistic_regression.comp");
sq->record<kp::OpTensorSyncLocal>({wOutI, wOutJ, bOut});
sq->end();
4. Loop across the number of iterations + 4-a. Submit algo operation on LR shader
.. code-block:: cpp
:linenos:
{
// ... continuing from codeblock above
uint32_t ITERATIONS = 100;
for (size_t i = 0; i < ITERATIONS; i++)
{
// Run evaluation which passes data through shader once
sq->eval();
4-b. Re-calculate weights from loss
Once the shader code has executed, we are able to use the outputs of the shader calculation.
In this case we want to subtract all the calculated weight and bias deltas from the back-prop step.
.. code-block:: cpp
:linenos:
{
// ...
for (size_t i = 0; i < ITERATIONS; i++)
{
// ... continuing from codeblock above
// Run evaluation which passes data through shader once
sq->eval();
// Subtract the resulting weights and biases
for(size_t j = 0; j < bOut->size(); j++) {
wInVec[0] -= wOutI->data()[j];
wInVec[1] -= wOutJ->data()[j];
bInVec[0] -= bOut->data()[j];
}
// Set the data for the GPU to use in the next iteration
wIn->mapDataIntoHostMemory();
bIn->mapDataIntoHostMemory();
}
5. Print output weights and bias
.. code-block:: cpp
:linenos:
std::cout << "Weight i: " << wIn->data()[0] << std::endl;
std::cout << "Weight j: " << wIn->data()[1] << std::endl;
std::cout << "Bias: " << bIn->data()[0] << std::endl;
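As a sanity check, the whole training loop above can be mirrored on the CPU in plain C++. This is a minimal reference sketch, not part of the Kompute API; the ``trainLogisticRegression`` helper is a made-up name for illustration:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// CPU mirror of the shader's sigmoid()
inline float sigmoidRef(float z) { return 1.0f / (1.0f + std::exp(-z)); }

// Hypothetical CPU reference for the full loop: each iteration computes the
// per-sample deltas exactly as the shader does (lr * (1/m) * x * dZ), then
// subtracts their sum from the weights, as in step 4-b. Returns { wi, wj, b }.
std::vector<float> trainLogisticRegression(const std::vector<float>& xi,
                                           const std::vector<float>& xj,
                                           const std::vector<float>& y,
                                           size_t iterations,
                                           float learningRate = 0.1f)
{
    float wi = 0.001f, wj = 0.001f, b = 0.0f; // same init as wInVec / bInVec
    const float m = static_cast<float>(y.size());
    for (size_t it = 0; it < iterations; it++) {
        float dWi = 0.0f, dWj = 0.0f, dB = 0.0f;
        for (size_t idx = 0; idx < y.size(); idx++) {
            // Same math as the shader: dZ = yHat - y
            float yHat = sigmoidRef(wi * xi[idx] + wj * xj[idx] + b);
            float dZ = yHat - y[idx];
            dWi += learningRate * (1.0f / m) * xi[idx] * dZ;
            dWj += learningRate * (1.0f / m) * xj[idx] * dZ;
            dB  += learningRate * (1.0f / m) * dZ;
        }
        // Host-side update, mirroring the subtraction loop over the output tensors
        wi -= dWi;
        wj -= dWj;
        b  -= dB;
    }
    return { wi, wj, b };
}
```

On the sample data above, where ``Xj`` matches ``Y`` exactly, this drives the ``wj`` weight up over the iterations; the result should roughly agree with the tensor values printed by the GPU version.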
Logistic Regression Compute Shader
----------------------------------
Finally, you can see the shader used for the logistic regression use case below:
.. code-block:: cpp
:linenos:
#version 450
layout (constant_id = 0) const uint M = 0;
layout (local_size_x = 1) in;
layout(set = 0, binding = 0) buffer bxi { float xi[]; };
layout(set = 0, binding = 1) buffer bxj { float xj[]; };
layout(set = 0, binding = 2) buffer by { float y[]; };
layout(set = 0, binding = 3) buffer bwin { float win[]; };
layout(set = 0, binding = 4) buffer bwouti { float wouti[]; };
layout(set = 0, binding = 5) buffer bwoutj { float woutj[]; };
layout(set = 0, binding = 6) buffer bbin { float bin[]; };
layout(set = 0, binding = 7) buffer bbout { float bout[]; };
float learningRate = 0.1;
float m = float(M);
float sigmoid(float z) {
return 1.0 / (1.0 + exp(-z));
}
float inference(vec2 x, vec2 w, float b) {
float z = dot(w, x) + b;
float yHat = sigmoid(z);
return yHat;
}
float calculateLoss(float yHat, float y) {
return -(y * log(yHat) + (1.0 - y) * log(1.0 - yHat));
}
void main() {
uint idx = gl_GlobalInvocationID.x;
vec2 wCurr = vec2(win[0], win[1]);
float bCurr = bin[0];
vec2 xCurr = vec2(xi[idx], xj[idx]);
float yCurr = y[idx];
float yHat = inference(xCurr, wCurr, bCurr);
float loss = calculateLoss(yHat, yCurr);
float dZ = yHat - yCurr;
vec2 dW = (1. / m) * xCurr * dZ;
float dB = (1. / m) * dZ;
wouti[idx] = learningRate * dW.x;
woutj[idx] = learningRate * dW.y;
bout[idx] = learningRate * dB;
}


@@ -64,7 +64,7 @@ Sequences can be executed synchronously or asynchronously without having to c
:linenos:
// Create tensors data explicitly in GPU with an operation
mgr.evalOpAsyncDefault<kp::OpTensorCreate>({ tensor });
mgr.rebuild({ tensor });
While this is running we can do other work, such as, in this case, creating the shader we'll be using.
@@ -125,7 +125,7 @@ Similar to above we can run other commands such as the `OpAlgoBase` asynchronous
// Run Async Kompute operation on the parameters provided
mgr.evalOpAsyncDefault<kp::OpAlgoBase<>>(
{ tensor },
std::vector<char>(shader.begin(), shader.end()));
kp::Shader::compile_source(shader));
// Here we can do other work
@@ -226,7 +226,7 @@ Similar to the asynchronous use case above, we can still run synchronous commands
:linenos:
// We run the first step synchronously on the default sequence
mgr.evalOpDefault<kp::OpTensorCreate>({ tensorA, tensorB });
mgr.rebuild({ tensorA, tensorB });
// Define your shader as a string (using string literals for simplicity)
// (You can also pass the raw compiled bytes, or even path to file)
@@ -259,17 +259,19 @@ Now we can actually trigger the parallel processing, running two OpAlgoBase Oper
.. code-block:: cpp
:linenos:
std::vector<uint32_t> spirv = kp::Shader::compile_source(shader);
// Run the first parallel operation in the `queueOne` sequence
mgr.evalOpAsync<kp::OpAlgoBase<>>(
{ tensorA },
"queueOne",
std::vector<char>(shader.begin(), shader.end()));
spirv);
// Run the second parallel operation in the `queueTwo` sequence
mgr.evalOpAsync<kp::OpAlgoBase<>>(
{ tensorB },
"queueTwo",
std::vector<char>(shader.begin(), shader.end()));
spirv);
Similar to the asynchronous example above, we are able to do other work whilst the tasks are executing.