From 1819b003c2cc9ff14152f6099eaab6eb94c8cb4d Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Thu, 8 Jul 2021 14:44:18 -0400
Subject: [PATCH] Update test json files with new number format and
 multi-device.

---
 scripts/test_cmp.json | 38222 ++++++++++++++++++----------------------
 scripts/test_ref.json | 38222 ++++++++++++++++++----------------------
 2 files changed, 33974 insertions(+), 42470 deletions(-)

diff --git a/scripts/test_cmp.json b/scripts/test_cmp.json
index 592d52c..20c8f8a 100644
--- a/scripts/test_cmp.json
+++ b/scripts/test_cmp.json
@@ -2,21 +2,42 @@
   "devices": [
     {
       "id": 0,
-      "name": "NVIDIA GeForce GTX 1650",
-      "sm_version": 750,
-      "ptx_version": 750,
-      "sm_default_clock_rate": 1560000000,
-      "number_of_sms": 16,
-      "max_blocks_per_sm": 16,
-      "max_threads_per_sm": 1024,
+      "name": "NVIDIA Quadro GV100",
+      "sm_version": 700,
+      "ptx_version": 700,
+      "sm_default_clock_rate": 1627000000,
+      "number_of_sms": 80,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
       "max_threads_per_block": 1024,
       "registers_per_sm": 65536,
       "registers_per_block": 65536,
-      "global_memory_size": 4294967296,
-      "global_memory_bus_peak_clock_rate": 4001000000,
-      "global_memory_bus_width": 128,
-      "global_memory_bus_bandwidth": 128032000000,
-      "l2_cache_size": 1048576,
+      "global_memory_size": 34078982144,
+      "global_memory_bus_peak_clock_rate": 850000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 870400000000,
+      "l2_cache_size": 6291456,
+      "shared_memory_per_sm": 98304,
+      "shared_memory_per_block": 49152,
+      "ecc_state": false
+    },
+    {
+      "id": 1,
+      "name": "NVIDIA Quadro GP100",
+      "sm_version": 600,
+      "ptx_version": 600,
+      "sm_default_clock_rate": 1442500000,
+      "number_of_sms": 56,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
+      "max_threads_per_block": 1024,
+      "registers_per_sm": 65536,
+      "registers_per_block": 65536,
+      "global_memory_size": 17069309952,
+      "global_memory_bus_peak_clock_rate": 715000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 732160000000,
+      "l2_cache_size": 4194304,
       "shared_memory_per_sm": 65536,
       "shared_memory_per_block": 49152,
       "ecc_state": false
@@ -25,25 +46,7256 @@
   "benchmarks": [
     {
       "index": 0,
-      "name": "cub::DeviceRadixSort::SortKeys - Overview",
+      "name": "simple",
       "min_samples": 10,
       "min_time": 0.5,
       "max_noise": 0.005,
       "skip_time": -1.0,
       "timeout": 0.5,
       "devices": [
-        0
+        0,
+        1
+      ],
+      "axes": null,
+      "states": {
+        "Device=0": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": null,
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "486"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010095795164609047"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006114730449640358"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010034803637751827"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005535128658782786"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001473929135854"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": null,
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "488"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010075532745901644"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005196761038903798"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010027413077530309"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003559489414701089"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014738126565483"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    },
+    {
+      "index": 1,
+      "name": "single_float64_axis",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 0.5,
+      "devices": [
+        0,
+        1
       ],
       "axes": {
-        "Key": {
-          "type": "type",
+        "Duration": {
+          "type": "float64",
           "flags": "",
           "values": [
             {
-              "input_string": "bool",
+              "input_string": "0",
               "description": "",
-              "is_active": true
+              "value": 0.0
             },
+            {
+              "input_string": "0.0001",
+              "description": "",
+              "value": 0.0001
+            },
+            {
+              "input_string": "0.0002",
+              "description": "",
+              "value": 0.0002
+            },
+            {
+              "input_string": "0.0003",
+              "description": "",
+              "value": 0.00030000000000000003
+            },
+            {
+              "input_string": "0.0004",
+              "description": "",
+              "value": 0.0004
+            },
+            {
+              "input_string": "0.0005",
+              "description": "",
+              "value": 0.0005
+            },
+            {
+              "input_string": "0.0006",
+              "description": "",
+              "value": 0.0006000000000000001
+            },
+            {
+              "input_string": "0.0007",
+              "description": "",
+              "value": 0.0007000000000000001
+            },
+            {
+              "input_string": "0.0008",
+              "description": "",
+              "value": 0.0008000000000000001
+            },
+            {
+              "input_string": "0.0009",
+              "description": "",
+              "value": 0.0009000000000000002
+            },
+            {
+              "input_string": "0.001",
+              "description": "",
+              "value": 0.0010000000000000002
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 Duration=0": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "14050"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "9.162447829181515e-06"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.03536831341378678"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "3.7685477789450405e-06"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.1242757507930245"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "1.6396544558213103e-06"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "305626"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0001": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "3833"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00010862307644142961"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.003939155614687134"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00010305688671520844"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004761970040891668"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00010137620362095862"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "5088"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0002": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2173"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002089772070869765"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0021045460135476644"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00020339080587790604"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0024397165593270475"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00020172840121363044"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2582"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0003": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00030000000000000003"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1519"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00030826679394338436"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0014114649766999914"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003027125938382783"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0016421113166888573"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003010571695496376"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1742"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0004": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0004"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1166"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004085880488850769"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010128960046765418"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004030490282469304"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0012417175292930155"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004014095938278854"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1304"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0005": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0005"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "945"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005090367534391529"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008945139151387666"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005034566609317042"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009551336090877046"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005017609577982818"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1044"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0006": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0006000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "796"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006083192776381908"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006965661259016556"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000602732541573109"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008277718240662281"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006010903174724053"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "872"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0007": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0007000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "686"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007086454693877553"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005852496456482882"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007030655056151287"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007134410597090336"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007014413925415692"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "748"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0008": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0008000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "603"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008089611791044773"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005396538572101352"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008033946325529826"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006279991294953942"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008017935563300363"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "654"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0009": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0009000000000000002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "539"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009083643543599264"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00046818252995143266"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009027890649266406"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005476172787827472"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009011217884181701"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "582"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.001": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0010000000000000002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "486"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010087072057613155"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041872799651378016"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010031157275776806"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004937944200398705"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001475909284053"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "14964"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "8.23057591553059e-06"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.050523894758860086"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "3.353875434461126e-06"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.05725803858596207"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "1.355632943746511e-06"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "368832"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0001": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "3942"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000107132078640284"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0038634259631888097"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0001024897333569082"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.003060119065396588"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0001013761587297066"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "5074"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0002": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2208"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002075046254528986"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001976178510512602"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00020285036215099666"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0015468676288172283"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00020172864733863198"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2595"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0003": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00030000000000000003"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1537"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003067784710474956"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0013859894180016342"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003021601552535132"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010334107287380953"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003010565582609204"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1737"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0004": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0004"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1176"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004071361462585035"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010022100665278571"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00040249948315068836"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007542402160340061"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004014089213396104"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1306"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0005": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0005"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "951"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005075497234490012"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007999383507906208"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005028805712670812"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005997416033360081"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005017610334757409"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1046"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0006": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0006000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "800"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006068307462500002"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000718570216952097"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006021880812197924"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005176533625753945"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006010893901462271"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "873"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0007": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0007000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "690"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007071831971014495"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005757596123903934"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007025585162466849"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00045240029617132243"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007014411477481618"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "748"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0008": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0008000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "606"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008075822128712869"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005708387835625503"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008029147460319033"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004053734463924786"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00080179443359375"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "655"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0009": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0009000000000000002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "541"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009068751312384467"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004621968439524866"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009022267493875558"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003405832639595575"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009011221046513186"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "583"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.001": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0010000000000000002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "488"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010072327028688517"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004212022611220011"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025862904845687"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003195925477697865"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014755598461355"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    },
+    {
+      "index": 2,
+      "name": "copy_sweep_grid_shape",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 0.5,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "BlockSize": {
+          "type": "int64",
+          "flags": "pow2",
+          "values": [
+            {
+              "input_string": "6",
+              "description": "2^6 = 64",
+              "value": 64
+            },
+            {
+              "input_string": "8",
+              "description": "2^8 = 256",
+              "value": 256
+            },
+            {
+              "input_string": "10",
+              "description": "2^10 = 1024",
+              "value": 1024
+            }
+          ]
+        },
+        "NumBlocks": {
+          "type": "int64",
+          "flags": "pow2",
+          "values": [
+            {
+              "input_string": "6",
+              "description": "2^6 = 64",
+              "value": 64
+            },
+            {
+              "input_string": "8",
+              "description": "2^8 = 256",
+              "value": 256
+            },
+            {
+              "input_string": "10",
+              "description": "2^10 = 1024",
+              "value": 1024
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 BlockSize=2^6 NumBlocks=2^6": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "70"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007152438914285718"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.05103516144522029"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007146643659046716"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.05105065000806433"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "9390263066.362482"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "75122104530.89986"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.08630756494818458"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006479399461012621"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "78"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^8 NumBlocks=2^6": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "229"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002168227908296944"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0073699479168641575"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0021626523655054347"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007425464144896749"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "31030814323.371826"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "248246514586.9746"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.28520969047216754"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002159405241287294"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "243"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^10 NumBlocks=2^6": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "448"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010965164419642858"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.012922365203073934"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010910109984023236"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.013003332317485732"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "61510712631.013084"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "492085701048.1047"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.565355814623282"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010869085366833847"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "486"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^6 NumBlocks=2^8": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "229"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002169319052401745"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.003922215380349919"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0021638464469576494"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.003914908768819146"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "31013690502.09386"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "248109524016.7509"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.2850523024089509"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0021620516050990224"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "243"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^8 NumBlocks=2^8": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "456"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001076449870614035"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.011436755828819038"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010709658272956548"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.01147415502832018"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "62662003109.342606"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "501296024874.74084"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5759375285785165"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010696090290923384"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "487"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^10 NumBlocks=2^8": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "500"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009796881099999996"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006630669601394768"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009742486392259615"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006702329573169841"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "68882686922.01393"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "551061495376.1115"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6331129312685104"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009725467921183118"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "542"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^6 NumBlocks=2^10": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "459"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010702333769063192"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009289316979889557"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010647220575212134"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009351871253339476"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "63029467198.450455"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "504235737587.60364"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5793149558681108"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010658674782853784"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "492"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^8 NumBlocks=2^10": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "500"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000979172994"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007443039949868979"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009736743034124385"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007443203952714375"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "68923318367.14127"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "551386546937.1301"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6334863820509308"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009717381278159657"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "541"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^10 NumBlocks=2^10": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "474"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010353058628691984"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.02225997387052118"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010297924077712025"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.022347468768318675"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "65167371106.61446"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "521338968852.9157"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5989648079652065"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010282588096414032"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "522"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^6 NumBlocks=2^6": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "76"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0066473009473684225"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010215778080601146"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006642639580525849"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001033363366599417"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "10102740512.482763"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "80821924099.8621"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.11038833601926096"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006642121520223497"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "79"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^8 NumBlocks=2^6": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "216"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0023017428981481495"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0022391000845061416"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002297048738709203"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002236143378518806"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "29215254717.542892"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "233722037740.34314"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.3192226258472781"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002296809949372944"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "228"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^10 NumBlocks=2^6": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "418"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001179723488038277"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0038851033159110274"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001175016037870252"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0039137311385035334"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "57113147256.81244"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "456905178054.4995"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6240509971242618"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011733407974243164"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "448"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^6 NumBlocks=2^8": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "224"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002222525227678572"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0013751517218031045"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002217807294002601"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0013709949704347455"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "30259105099.65222"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "242072840797.21777"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.33062833369375244"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002216307048556171"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "237"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^8 NumBlocks=2^8": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "434"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011343795576036872"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0063161302839560935"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011296571429973376"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006343846096918854"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "59406399911.69264"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "475251199293.54114"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6491083906434948"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011274932015425106"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "469"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^10 NumBlocks=2^8": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "437"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011263629336384434"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002337926005235178"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011216670730294026"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002335621873443411"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "59829574758.535194"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "478636598068.28156"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6537322416797989"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011199346014793882"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "470"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^6 NumBlocks=2^10": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "439"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011228420820045561"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0030427104293198376"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011181568284784197"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0030409709404223893"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "60017398535.51786"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "480139188284.1429"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6557845119702563"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011164653372257314"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "470"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^8 NumBlocks=2^10": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "440"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00112084285909091"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0025157975228794673"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011161849425597621"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0025072621927497768"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "60123427078.40005"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "480987416627.2004"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6569430406293711"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011137209100238348"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "472"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^10 NumBlocks=2^10": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "464"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010599195581896552"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0019836197107494535"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010552384144273295"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0019838471234599063"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "63595925889.809"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "508767407118.472"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.69488555386592"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00105384634014122"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    },
+    {
+      "index": 3,
+      "name": "copy_type_sweep",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 0.5,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "T": {
+          "type": "type",
+          "flags": "",
+          "values": [
             {
               "input_string": "U8",
               "description": "uint8_t",
@@ -64,6 +7316,2415 @@
               "description": "uint64_t",
               "is_active": true
             },
+            {
+              "input_string": "F32",
+              "description": "float",
+              "is_active": true
+            },
+            {
+              "input_string": "F64",
+              "description": "double",
+              "is_active": true
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 T=U8": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U8"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "217"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0022855767235023037"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00300372701685277"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00228007503588628"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0030435512411696388"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "117730974540.34332"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "235461949080.68665"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.27052154076365653"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0022790989087975544"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "230"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=U16": {
+          "device": 0,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U16"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "342"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0014443213274853803"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005313412134692269"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0014388333057102406"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0053179597407149795"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "93282333309.4497"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "373129333237.7988"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.4286871935176917"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001437488301595052"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "360"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=U32": {
+          "device": 0,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "456"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010777073771929832"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.011572026564316875"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010722620346044238"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.011623749514354471"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "62586253951.21598"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "500690031609.72784"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5752413046986763"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001070086296237245"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "490"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=U64": {
+          "device": 0,
+          "type_config_index": 3,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "514"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009532965797665363"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0051196185177249205"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009478394400748767"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005131328754237089"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "35400966219.92148"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "566415459518.7437"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6507530555132625"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009458808417792793"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "555"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=F32": {
+          "device": 0,
+          "type_config_index": 4,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "456"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010760199342105259"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.010177745756624636"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010705234403150128"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.010199116668820488"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "62687897782.2779"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "501503182258.2232"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5761755310871131"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010701700846354166"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "489"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=F64": {
+          "device": 0,
+          "type_config_index": 5,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "514"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009537802023346293"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00826653498170841"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009483462104769544"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008316637345502817"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "35382048907.14371"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "566112782514.2993"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6504053107930828"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009457701526988636"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "550"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=U8": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U8"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "184"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0027052529021739146"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.003601477829562271"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0027005073000555477"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0036073295118906625"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "99401862751.66835"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "198803725503.3367"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.2715304380235696"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0027012594746802137"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "193"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=U16": {
+          "device": 1,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U16"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "325"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0015229335907692304"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00461287264596262"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001518287454751822"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004612569935175065"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "88400735697.27223"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "353602942789.0889"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.48295856477967786"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001517369088409953"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "346"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=U32": {
+          "device": 1,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "435"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011324326574712646"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006407601007533829"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011277568740406254"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006437993538809961"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "59506499623.0584"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "476051996984.4672"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6502021374897116"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011264256719333023"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "469"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=U64": {
+          "device": 1,
+          "type_config_index": 3,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "468"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010516201538461542"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002564258397948022"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010469369577546404"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0025731489764373736"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "32050097908.439487"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "512801566535.0318"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.700395496250863"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010448107472453933"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "502"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=F32": {
+          "device": 1,
+          "type_config_index": 4,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "435"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001132957098850574"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006385301490538342"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011282652709675928"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006406051844181131"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "59479685962.90115"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "475837487703.2092"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6499091560631682"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011265583781452921"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "462"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=F64": {
+          "device": 1,
+          "type_config_index": 5,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "468"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010518258760683764"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002638851741610878"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010471444782028856"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0026422486584488855"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "32043746300.974895"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "512699940815.5983"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.7002566936401856"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010447956953898514"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "505"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    },
+    {
+      "index": 4,
+      "name": "copy_type_conversion_sweep",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 0.5,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "In": {
+          "type": "type",
+          "flags": "",
+          "values": [
             {
               "input_string": "I8",
               "description": "int8_t",
@@ -80,13 +9741,13 @@
               "is_active": true
             },
             {
-              "input_string": "I64",
-              "description": "int64_t",
+              "input_string": "F32",
+              "description": "float",
               "is_active": true
             },
             {
-              "input_string": "F32",
-              "description": "float",
+              "input_string": "I64",
+              "description": "int64_t",
               "is_active": true
             },
             {
@@ -96,77 +9757,45 @@
             }
           ]
         },
-        "Input": {
+        "Out": {
           "type": "type",
           "flags": "",
           "values": [
             {
-              "input_string": "Rand",
-              "description": "Random values uniformly distributed across `T`'s value range",
+              "input_string": "I8",
+              "description": "int8_t",
               "is_active": true
-            }
-          ]
-        },
-        "Pattern": {
-          "type": "type",
-          "flags": "",
-          "values": [
+            },
             {
-              "input_string": "Ascend",
-              "description": "",
+              "input_string": "I16",
+              "description": "int16_t",
               "is_active": true
-            }
-          ]
-        },
-        "Elements": {
-          "type": "int64",
-          "flags": "pow2",
-          "values": [
-            {
-              "input_string": "16",
-              "description": "2^16 = 65536",
-              "value": 65536
             },
             {
-              "input_string": "18",
-              "description": "2^18 = 262144",
-              "value": 262144
+              "input_string": "I32",
+              "description": "int32_t",
+              "is_active": true
             },
             {
-              "input_string": "20",
-              "description": "2^20 = 1048576",
-              "value": 1048576
+              "input_string": "F32",
+              "description": "float",
+              "is_active": true
             },
             {
-              "input_string": "22",
-              "description": "2^22 = 4194304",
-              "value": 4194304
+              "input_string": "I64",
+              "description": "int64_t",
+              "is_active": true
             },
             {
-              "input_string": "24",
-              "description": "2^24 = 16777216",
-              "value": 16777216
-            },
-            {
-              "input_string": "26",
-              "description": "2^26 = 67108864",
-              "value": 67108864
-            },
-            {
-              "input_string": "28",
-              "description": "2^28 = 268435456",
-              "value": 268435456
-            },
-            {
-              "input_string": "30",
-              "description": "2^30 = 1073741824",
-              "value": 1073741824
+              "input_string": "F64",
+              "description": "double",
+              "is_active": true
             }
           ]
         }
       },
       "states": {
-        "Device=0 Key=bool Input=Rand Pattern=Ascend Elements=2^16": {
+        "Device=0 In=I8 Out=I8": {
           "device": 0,
           "type_config_index": 0,
           "min_samples": 10,
@@ -175,4378 +9804,48 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
-              "value": "bool"
+              "value": "I8"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 65536
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 3042
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3.927120315581862e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.4351576079801729
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3.2775679267673803e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.4041140207967676
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1999531404.5142384
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3999062809.0284767
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.031234869478165433
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=bool Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "bool"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 262144
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1691
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 5.06085156712005e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.3900269997798082
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4.116893645171086e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.15850062777398677
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6367519362.747736
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 12735038725.495472
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.09946762313714909
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=bool Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "bool"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1048576
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 537
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00010505661080074492
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13838288384658645
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9.64652218021716e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.08737598421104262
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 10869990037.96822
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 21739980075.93644
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.16980114405723912
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=bool Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "bool"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 141
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0003081985815602837
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.025128010564140792
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0003021013356269674
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02018452044120483
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 13883765165.404953
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 27767530330.809906
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.2168796108067507
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=bool Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "bool"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 34
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.001104020588235294
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00868011645589697
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0010932272953145647
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.003434233699996367
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15346503029.978348
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 30693006059.956696
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.23972917754902442
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=bool Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "bool"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 8
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0042372500000000006
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0030855867536742037
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0042257159948349
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0019932191078502345
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15881063488.892126
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 31762126977.784252
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.2480795971146608
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=bool Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "bool"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.019142950000000002
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.019130672454833984
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14031679055.388933
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 28063358110.777866
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.2191901876935287
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=bool Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "bool"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0667937
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06678323364257813
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16078014876.407965
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 32156029752.81593
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.2511561933955256
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^16": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 65536
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 3062
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 5.317815153494455e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.3713839348328813
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4.7155929496967694e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.40429857529493773
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1389772202.54378
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2779544405.08756
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.021709763223940578
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 262144
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1569
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7.891446781389421e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11657350278726154
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7.26659224292061e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11037642266139251
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3607523186.0627747
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7215046372.125549
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.05635346141687664
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1048576
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 505
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0001637732673267328
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04440230894077945
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00015781791602620992
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04411047078155534
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6644213954.934341
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 13288427909.868683
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1037898955719561
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 129
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0005240697674418604
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01553926208912671
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0005178778284741924
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.015023407302414188
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8099022142.650033
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16198044285.300066
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12651559208088656
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 31
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0019522129032258073
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.23301756357696535
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.001942181168063994
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.23453440661836827
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8638337285.8691
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17276674571.7382
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13494028502044958
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 8
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0072903875
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007239378588174434
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007273227989673614
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007137742915442734
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9226833545.611362
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 18453667091.222725
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.14413324083996754
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.029422500000000004
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.029412703514099123
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9126514190.418577
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 18253028380.837154
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.14256614268961787
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12713770000000002
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12712566375732423
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8446302597.481127
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16892605194.962254
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.131940492962402
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^16": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 131072
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2506
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7.933663208300092e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.27652333688390907
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7.329776243909458e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.2908651937565574
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 894106420.4307182
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3576425681.7228727
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.027933842177915464
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 524288
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1269
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00014559330181245089
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.19873696733232882
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00013759964715081717
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.09333440846518179
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1905121164.3927765
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7620484657.571106
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.05952015634818722
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 2097152
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 406
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00038108940886699564
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.03821569814913922
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00037408323064813464
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.029659979896029746
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2803055347.290609
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 11212221389.162436
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.08757358620627996
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 8388608
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 110
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0011471336363636359
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01637197025341408
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0011375141815705729
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.014295623169967478
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3687254249.6207814
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14749016998.483126
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11519789582669275
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 28
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004043317857142858
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01100346103239151
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004035516560077667
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01078543812390853
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4157389952.4964676
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16629559809.98587
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12988596452438353
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 134217728
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 7
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.015595228571428572
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.005255757704044775
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.015586642401559011
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00526989300271275
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4305536899.54981
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17222147598.19924
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13451439951105382
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0652593
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0652172622680664
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4116018469.107669
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16464073876.430676
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12859342880241406
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
+              "value": "I8"
             }
           },
           "summaries": null,
           "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
+          "skip_reason": "Not a conversion: InputType == OutputType."
         },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^16": {
+        "Device=0 In=I8 Out=I16": {
           "device": 0,
-          "type_config_index": 3,
+          "type_config_index": 1,
           "min_samples": 10,
           "min_time": 0.5,
           "max_noise": 0.005,
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
-              "value": "U32"
+              "value": "I8"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
+              "value": "I16"
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -4554,11 +9853,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 262144
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
               }
             },
             "Number of Samples (Cold)": {
@@ -4576,7 +9889,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 2245
+                "value": "775"
               }
             },
             "Average CPU Time (Cold)": {
@@ -4594,7 +9907,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 8.775260579064617e-05
+                "value": "0.000624855941935483"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -4612,7 +9925,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.2588213270041119
+                "value": "0.0026998034506879763"
               }
             },
             "Average GPU Time (Cold)": {
@@ -4630,7 +9943,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 8.086750657362002e-05
+                "value": "0.0006194506773641069"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -4648,7 +9961,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.2692444298865416
+                "value": "0.0027411303205337333"
               }
             },
             "Element Throughput": {
@@ -4666,7 +9979,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 810412027.9798346
+                "value": "108336089461.65392"
               }
             },
             "Average Global Memory Throughput": {
@@ -4684,7 +9997,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 6483296223.838676
+                "value": "325008268384.96173"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -4702,39 +10015,77 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.05063809222568324
+                "value": "0.37340104364081084"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006171294842507731"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "849"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^18": {
+        "Device=0 In=I8 Out=I32": {
           "device": 0,
-          "type_config_index": 3,
+          "type_config_index": 2,
           "min_samples": 10,
           "min_time": 0.5,
           "max_noise": 0.005,
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
-              "value": "U32"
+              "value": "I8"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
+              "value": "I32"
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -4742,11 +10093,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 1048576
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
               }
             },
             "Number of Samples (Cold)": {
@@ -4764,7 +10129,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 1055
+                "value": "660"
               }
             },
             "Average CPU Time (Cold)": {
@@ -4782,7 +10147,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.00021251706161137438
+                "value": "0.0007373176257575753"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -4800,7 +10165,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.06362320460336848
+                "value": "0.0039374442120074255"
               }
             },
             "Average GPU Time (Cold)": {
@@ -4818,7 +10183,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.00020548479910145425
+                "value": "0.0007318062056194664"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -4836,7 +10201,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.06357499446836187
+                "value": "0.003968882228293087"
               }
             },
             "Element Throughput": {
@@ -4854,7 +10219,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 1275734269.1347759
+                "value": "91703054011.67381"
               }
             },
             "Average Global Memory Throughput": {
@@ -4872,7 +10237,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 10205874153.078207
+                "value": "458515270058.369"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -4890,13 +10255,49 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.07971346345505972
+                "value": "0.5267868451957365"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007297926682692308"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "715"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^20": {
+        "Device=0 In=I8 Out=F32": {
           "device": 0,
           "type_config_index": 3,
           "min_samples": 10,
@@ -4905,24 +10306,26 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
-              "value": "U32"
+              "value": "I8"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
+              "value": "F32"
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -4930,11 +10333,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 4194304
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
               }
             },
             "Number of Samples (Cold)": {
@@ -4952,7 +10369,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 328
+                "value": "655"
               }
             },
             "Average CPU Time (Cold)": {
@@ -4970,7 +10387,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.0006486628048780487
+                "value": "0.0007424307770992365"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -4988,7 +10405,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.062073612578813026
+                "value": "0.004224076490124651"
               }
             },
             "Average GPU Time (Cold)": {
@@ -5006,7 +10423,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.0006387051705543588
+                "value": "0.0007369820818646267"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -5024,7 +10441,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.019862156160162552
+                "value": "0.004224007291086274"
               }
             },
             "Element Throughput": {
@@ -5042,7 +10459,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 1641721483.3095798
+                "value": "91059017106.91382"
               }
             },
             "Average Global Memory Throughput": {
@@ -5060,7 +10477,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 13133771866.476639
+                "value": "455295085534.5691"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -5078,795 +10495,49 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.10258194722004373
+                "value": "0.5230871846674737"
               }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
             },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
+            "Average GPU Time (Batch)": {
               "hint": {
                 "type": "string",
-                "value": "bytes"
+                "value": "duration"
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
               },
               "value": {
-                "type": "int64",
-                "value": 16777216
+                "type": "float64",
+                "value": "0.0007352958009195734"
               }
             },
-            "Number of Samples (Cold)": {
+            "Number of Samples (Batch)": {
               "hint": {
                 "type": "string",
                 "value": "sample_size"
               },
               "short_name": {
                 "type": "string",
-                "value": "Samples"
+                "value": "Batch"
               },
               "description": {
                 "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
+                "value": "Number of kernel executions in hot time measurements."
               },
               "value": {
                 "type": "int64",
-                "value": 87
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0022856022988505752
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01813131000490793
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.002277155692549959
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.018170531150182676
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1841904799.800148
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14735238398.401184
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11509027741815471
+                "value": "706"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 22
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.008837786363636363
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007500416221662679
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.008825995575297962
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007696619972960275
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1900886518.3385963
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15207092146.70877
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11877571346779532
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 6
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.035352516666666674
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006577426327632489
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.03533673604329427
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006429928650474585
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1899124580.090781
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15192996640.726248
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11866561985071113
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1430682
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.14305706024169923
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1876422285.9498873
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15011378287.599098
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11724708110159256
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^16": {
+        "Device=0 In=I8 Out=I64": {
           "device": 0,
           "type_config_index": 4,
           "min_samples": 10,
@@ -5875,24 +10546,26 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
-              "value": "U64"
+              "value": "I8"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
+              "value": "I64"
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -5900,11 +10573,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 524288
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "536870912"
               }
             },
             "Number of Samples (Cold)": {
@@ -5922,7 +10609,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 1413
+                "value": "407"
               }
             },
             "Average CPU Time (Cold)": {
@@ -5940,7 +10627,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.00018726730360934168
+                "value": "0.0012081452088452092"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -5958,7 +10645,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.19158497176050596
+                "value": "0.009488803674295956"
               }
             },
             "Average GPU Time (Cold)": {
@@ -5976,7 +10663,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.00017967336391272188
+                "value": "0.0012026181030624918"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -5994,7 +10681,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.1929620201640373
+                "value": "0.009557481496348873"
               }
             },
             "Element Throughput": {
@@ -6012,7 +10699,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 364750782.04599524
+                "value": "55802306508.69623"
               }
             },
             "Average Global Memory Throughput": {
@@ -6030,7 +10717,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 5836012512.735924
+                "value": "502220758578.26605"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -6048,14 +10735,4308 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.045582452142713725
+                "value": "0.5769999524106917"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0012018776918068911"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "429"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^18": {
+        "Device=0 In=I8 Out=F64": {
           "device": 0,
+          "type_config_index": 5,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "536870912"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "415"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011839576867469879"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007849889234045669"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011785369482385114"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007894267956792296"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "56942520215.682335"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "512482681941.14105"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5887898459801713"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011769149367873732"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "444"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I16 Out=I8": {
+          "device": 0,
+          "type_config_index": 6,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I16 Out=I16": {
+          "device": 0,
+          "type_config_index": 7,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=0 In=I16 Out=I32": {
+          "device": 0,
+          "type_config_index": 8,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1105"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00043164991312217215"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.010807224020750296"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004261522234295282"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.010967121645138193"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "78738136645.08269"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "472428819870.49615"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.542772081652684"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004235287455769328"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1232"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I16 Out=F32": {
+          "device": 0,
+          "type_config_index": 9,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1103"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004325297388939259"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008132056614414322"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004270972975631891"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008246119473642231"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "78563906143.74146"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "471383436862.44867"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5415710441893942"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004253676273859287"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1241"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I16 Out=I64": {
+          "device": 0,
+          "type_config_index": 10,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "733"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006613082878581173"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007945982231516395"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006558368572759433"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008081739278450547"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "51162772612.94874"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "511627726129.48737"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5878075897627383"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006543398455948913"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "794"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I16 Out=F64": {
+          "device": 0,
+          "type_config_index": 11,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "734"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006602106062670296"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007402870806519008"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006547767629577938"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007440276584065623"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "51245605980.92404"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "512456059809.2404"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.588759259891131"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006532440560455698"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "814"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I32 Out=I8": {
+          "device": 0,
+          "type_config_index": 12,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I32 Out=I16": {
+          "device": 0,
+          "type_config_index": 13,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I32 Out=I32": {
+          "device": 0,
+          "type_config_index": 14,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=0 In=I32 Out=F32": {
+          "device": 0,
+          "type_config_index": 15,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1735"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002670209631123916"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.013093461911533781"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002615653075986357"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.013401071612529727"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "64141594900.43743"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "513132759203.49945"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5895367178349029"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00025968648747412565"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2023"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I32 Out=I64": {
+          "device": 0,
+          "type_config_index": 16,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1233"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00038442103811841063"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00844365555913354"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000379033355382238"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008658982321334757"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44263165132.474785"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "531157981589.6974"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6102458428190457"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00037769805444141474"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1381"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I32 Out=F64": {
+          "device": 0,
+          "type_config_index": 17,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1233"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003844277526358471"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009146640310310188"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00037900421011283686"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009309598787035932"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44266568951.84647"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "531198827422.15765"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6102927704758245"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003778626589998872"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1385"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=F32 Out=I8": {
+          "device": 0,
+          "type_config_index": 18,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F32 Out=I16": {
+          "device": 0,
+          "type_config_index": 19,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F32 Out=I32": {
+          "device": 0,
+          "type_config_index": 20,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1726"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00026862367844727737"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.013398984730737511"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002631768242198105"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.013647890011152018"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "63748835216.53615"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "509990681732.2892"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5859267942696337"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002611921188044063"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1917"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=F32 Out=F32": {
+          "device": 0,
+          "type_config_index": 21,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=0 In=F32 Out=I64": {
+          "device": 0,
+          "type_config_index": 22,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1235"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003840306607287451"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009229816202298303"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00037859300990336473"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009349966037483663"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44314648081.54371"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "531775776978.5245"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6109556261242239"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00037728447759194497"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1385"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=F32 Out=F64": {
+          "device": 0,
+          "type_config_index": 23,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1233"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003844510843471213"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009332888492996015"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00037903595060134466"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009455810939030309"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44262862067.25975"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "531154344807.117"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6102416645302355"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00037756963876577526"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1352"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I64 Out=I8": {
+          "device": 0,
+          "type_config_index": 24,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I64 Out=I16": {
+          "device": 0,
+          "type_config_index": 25,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I64 Out=I32": {
+          "device": 0,
+          "type_config_index": 26,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I64 Out=F32": {
+          "device": 0,
+          "type_config_index": 27,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I64 Out=I64": {
+          "device": 0,
+          "type_config_index": 28,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=0 In=I64 Out=F64": {
+          "device": 0,
+          "type_config_index": 29,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "8388608"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1863"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00024704845947396656"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008865365856503242"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00024161205912269328"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009044628498958832"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "34719326636.507706"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "555509226184.1233"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.638222916112274"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00023959052166815616"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2168"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=F64 Out=I8": {
+          "device": 0,
+          "type_config_index": 30,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F64 Out=I16": {
+          "device": 0,
+          "type_config_index": 31,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F64 Out=I32": {
+          "device": 0,
+          "type_config_index": 32,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F64 Out=F32": {
+          "device": 0,
+          "type_config_index": 33,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F64 Out=I64": {
+          "device": 0,
+          "type_config_index": 34,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "8388608"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1863"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002471723081052067"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008513791485233733"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00024173997316990882"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008792904583053216"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "34700955286.79488"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "555215284588.7181"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6378852074778472"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002398694754203643"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2141"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=F64 Out=F64": {
+          "device": 0,
+          "type_config_index": 35,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=1 In=I8 Out=I8": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=1 In=I8 Out=I16": {
+          "device": 1,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "704"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006917278508522718"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.031528564248938934"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006868111818859521"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.03156956554686085"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "97710791218.80649"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "293132373656.4195"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.4003665505578282"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000659236081199501"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "789"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I8 Out=I32": {
+          "device": 1,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "568"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008635000933098584"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007603813185561577"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008587616908718169"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0076405577469716105"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "78146084895.6489"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "390730424478.2445"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5336680841322177"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008576233512476871"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "608"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I8 Out=F32": {
+          "device": 1,
+          "type_config_index": 3,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "568"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008623903292253519"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007683250202065139"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008576831001211219"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0077139887360741476"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "78244358540.49461"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "391221792702.473"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5343392055049074"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008563666250191483"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "612"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I8 Out=I64": {
+          "device": 1,
           "type_config_index": 4,
           "min_samples": 10,
           "min_time": 0.5,
@@ -6063,24 +15044,26 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
-              "value": "U64"
+              "value": "I8"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
+              "value": "I64"
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -6088,11 +15071,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 2097152
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "536870912"
               }
             },
             "Number of Samples (Cold)": {
@@ -6110,7 +15107,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 593
+                "value": "339"
               }
             },
             "Average CPU Time (Cold)": {
@@ -6128,7 +15125,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.0005387549747048903
+                "value": "0.0014575299587020652"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -6146,7 +15143,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.04111938777242051
+                "value": "0.005325090452654586"
               }
             },
             "Average GPU Time (Cold)": {
@@ -6164,7 +15161,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.0005312656186282335
+                "value": "0.0014528400236878067"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -6182,7 +15179,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.04063673179425505
+                "value": "0.005339082380112657"
               }
             },
             "Element Throughput": {
@@ -6200,7 +15197,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 493433022.59399897
+                "value": "46191502784.76956"
               }
             },
             "Average Global Memory Throughput": {
@@ -6218,7 +15215,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 7894928361.5039835
+                "value": "415723525062.9261"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -6236,826 +15233,50 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.0616637118962758
+                "value": "0.56780420271925"
               }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 4,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
             },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
+            "Average GPU Time (Batch)": {
               "hint": {
                 "type": "string",
-                "value": "bytes"
+                "value": "duration"
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
               },
               "value": {
-                "type": "int64",
-                "value": 8388608
+                "type": "float64",
+                "value": "0.00145044431581602"
               }
             },
-            "Number of Samples (Cold)": {
+            "Number of Samples (Batch)": {
               "hint": {
                 "type": "string",
                 "value": "sample_size"
               },
               "short_name": {
                 "type": "string",
-                "value": "Samples"
+                "value": "Batch"
               },
               "description": {
                 "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
+                "value": "Number of kernel executions in hot time measurements."
               },
               "value": {
                 "type": "int64",
-                "value": 181
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0018093104972375699
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04001254538809503
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0017961900550357542
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01423183226589721
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 583777867.5258992
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9340445880.414387
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0729539949420019
+                "value": "364"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 4,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 47
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00701123404255319
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.022882268388176443
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006997501982019303
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.023591403635686216
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 599400187.492284
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9590402999.876545
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07490629686231992
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 4,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 134217728
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 12
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.027476083333333335
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.009319641724100018
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02746358140309652
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.009143918373427004
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 610889590.6091974
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9774233449.747158
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07634211329782521
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 4,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 3
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11105753333333335
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11104602813720703
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 604333762.5464745
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9669340200.743591
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07552283960840721
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 4,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 4,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=I8 Input=Rand Pattern=Ascend Elements=2^16": {
-          "device": 0,
+        "Device=1 In=I8 Out=F64": {
+          "device": 1,
           "type_config_index": 5,
           "min_samples": 10,
           "min_time": 0.5,
@@ -7063,24 +15284,26 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
               "value": "I8"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
+              "value": "F64"
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -7088,11 +15311,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 65536
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "536870912"
               }
             },
             "Number of Samples (Cold)": {
@@ -7110,7 +15347,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 3087
+                "value": "339"
               }
             },
             "Average CPU Time (Cold)": {
@@ -7128,7 +15365,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 5.5682150955620366e-05
+                "value": "0.0014595411091445434"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -7146,7 +15383,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.15974697309068234
+                "value": "0.00536065113916752"
               }
             },
             "Average GPU Time (Cold)": {
@@ -7164,7 +15401,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 4.976085047405057e-05
+                "value": "0.0014548183609250722"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -7182,7 +15419,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.1566862558828566
+                "value": "0.005369023538480178"
               }
             },
             "Element Throughput": {
@@ -7200,7 +15437,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 1317019290.7811313
+                "value": "46128689190.67507"
               }
             },
             "Average Global Memory Throughput": {
@@ -7218,7 +15455,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 2634038581.5622625
+                "value": "415158202716.0756"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -7236,1330 +15473,50 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.020573283097680757
+                "value": "0.567032073202682"
               }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I8 Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 5,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I8"
             },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
+            "Average GPU Time (Batch)": {
               "hint": {
                 "type": "string",
-                "value": "bytes"
+                "value": "duration"
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
               },
               "value": {
-                "type": "int64",
-                "value": 262144
+                "type": "float64",
+                "value": "0.00145255855984158"
               }
             },
-            "Number of Samples (Cold)": {
+            "Number of Samples (Batch)": {
               "hint": {
                 "type": "string",
                 "value": "sample_size"
               },
               "short_name": {
                 "type": "string",
-                "value": "Samples"
+                "value": "Batch"
               },
               "description": {
                 "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
+                "value": "Number of kernel executions in hot time measurements."
               },
               "value": {
                 "type": "int64",
-                "value": 1508
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8.478799734748009e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.18692260194964838
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7.815668975504499e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07663177112866752
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3354082687.247878
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6708165374.495756
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.052394443377403746
+                "value": "360"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=I8 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 5,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1048576
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 493
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.000172661663286004
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.03967760002514821
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0001667072121922918
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.03756877297429747
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6289925829.9065
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 12579851659.813
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.09825552721048644
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I8 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 5,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 126
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.000547765079365079
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02267930122532969
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0005414133327347893
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.020125269100866486
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7746953660.734054
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15493907321.468107
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12101589697472591
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I8 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 5,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 31
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0019378612903225812
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.011107088633221586
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0019301894287909237
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01047020541368632
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8692004913.999191
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17384009827.998383
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13577863212320657
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I8 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 5,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 8
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0075735125
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.008208621847337904
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007558403968811036
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006689815940438041
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8878708293.036163
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17757416586.072327
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13869514329286683
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I8 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 5,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.03057025
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.03056054401397705
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8783726358.968918
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17567452717.937836
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13721142150351345
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I8 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 5,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1411317
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.14112380981445313
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7608509332.420483
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15217018664.840965
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11885324500781809
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I16 Input=Rand Pattern=Ascend Elements=2^16": {
-          "device": 0,
+        "Device=1 In=I16 Out=I8": {
+          "device": 1,
           "type_config_index": 6,
           "min_samples": 10,
           "min_time": 0.5,
@@ -8567,1345 +15524,21 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
               "value": "I16"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 131072
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2444
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8.023633387888718e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.15957149507307994
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7.409614451997878e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11997847029431298
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 884472470.5254984
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3537889882.1019936
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.027632856489799375
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I16 Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 6,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 524288
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1258
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00014721875993640703
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.05030707755378517
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00014142066840858286
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04965983055115567
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1853647016.026198
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7414588064.104792
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.05791199125300543
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I16 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 6,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 2097152
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 395
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00038132582278481046
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.022492628088386854
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.000374839979561069
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.020997160180716862
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2797396374.922077
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 11189585499.688309
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.08739678751943505
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I16 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 6,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 8388608
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 107
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0011724710280373834
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.019342776294093793
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0011632810418850906
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.016095597179303357
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3605580980.846342
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14422323923.385368
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11264624409042559
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I16 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 6,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 27
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00414218888888889
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.010143570260672414
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004132891301755552
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01004138981574669
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4059437999.9477477
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16237751999.79099
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12682573106560072
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I16 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 6,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 134217728
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 7
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.015818842857142856
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004297912986520931
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.015806600979396273
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004417201153941865
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4245622704.5571437
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16982490818.228575
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1326425488801907
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I16 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 6,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06584135
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06581772994995116
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4078467248.9331145
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16313868995.732458
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12742024646754294
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I16 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 6,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
+              "value": "I8"
             }
           },
           "summaries": null,
           "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
         },
-        "Device=0 Key=I32 Input=Rand Pattern=Ascend Elements=2^16": {
-          "device": 0,
+        "Device=1 In=I16 Out=I16": {
+          "device": 1,
           "type_config_index": 7,
           "min_samples": 10,
           "min_time": 0.5,
@@ -9913,1345 +15546,21 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
-              "value": "I32"
+              "value": "I16"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 262144
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2237
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8.876477425122915e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.35106187182872767
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8.160484933232342e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.2520628318430057
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 803089528.823398
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6424716230.587184
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.05018055041385891
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I32 Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 7,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1048576
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1033
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0002160535333978701
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1188318605057773
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00020900739510142425
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12132837233117388
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1254233133.1041677
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 10033865064.833342
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07836997832442938
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I32 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 7,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 324
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0006508543209876547
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.017018531103233366
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0006438862221476476
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01711700991339697
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1628511317.578021
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 13028090540.624168
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10175651821907154
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I32 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 7,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 84
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.002322446428571428
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0213207920280134
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.002313739804994491
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02154045421700801
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1812781191.2757347
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14502249530.205877
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1132705068280264
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I32 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 7,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 21
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.008975690476190476
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.008053073918408916
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.008964469319298155
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007960830786781505
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1871523611.987053
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14972188895.896423
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11694099050156542
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I32 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 7,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 5
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.035879860000000006
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007059995549678311
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.03586985015869141
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0070565543246424054
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1870898922.1617713
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14967191377.29417
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11690195714582426
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I32 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 7,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1458295
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.14581488037109375
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1840933211.458537
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14727465691.668297
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11502956832407756
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I32 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 7,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
+              "value": "I16"
             }
           },
           "summaries": null,
           "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
+          "skip_reason": "Not a conversion: InputType == OutputType."
         },
-        "Device=0 Key=I64 Input=Rand Pattern=Ascend Elements=2^16": {
-          "device": 0,
+        "Device=1 In=I16 Out=I32": {
+          "device": 1,
           "type_config_index": 8,
           "min_samples": 10,
           "min_time": 0.5,
@@ -11259,24 +15568,26 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
-              "value": "I64"
+              "value": "I16"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
+              "value": "I32"
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -11284,11 +15595,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 524288
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
               }
             },
             "Number of Samples (Cold)": {
@@ -11306,7 +15631,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 1442
+                "value": "1042"
               }
             },
             "Average CPU Time (Cold)": {
@@ -11324,7 +15649,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.00018916165048543735
+                "value": "0.00046152892994241876"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -11342,7 +15667,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.08474054695372502
+                "value": "0.007446740614945881"
               }
             },
             "Average GPU Time (Cold)": {
@@ -11360,7 +15685,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.00018243015832403314
+                "value": "0.00045683037259413987"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -11378,7 +15703,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.08522171353418057
+                "value": "0.007515900606668647"
               }
             },
             "Element Throughput": {
@@ -11396,7 +15721,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 359238848.4561566
+                "value": "73450527839.16064"
               }
             },
             "Average Global Memory Throughput": {
@@ -11414,7 +15739,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 5747821575.298506
+                "value": "440703167034.96387"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -11432,1014 +15757,50 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.04489363264885736
+                "value": "0.6019219392413733"
               }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I64 Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 8,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I64"
             },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
+            "Average GPU Time (Batch)": {
               "hint": {
                 "type": "string",
-                "value": "bytes"
+                "value": "duration"
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
               },
               "value": {
-                "type": "int64",
-                "value": 2097152
+                "type": "float64",
+                "value": "0.0004549347768605374"
               }
             },
-            "Number of Samples (Cold)": {
+            "Number of Samples (Batch)": {
               "hint": {
                 "type": "string",
                 "value": "sample_size"
               },
               "short_name": {
                 "type": "string",
-                "value": "Samples"
+                "value": "Batch"
               },
               "description": {
                 "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
+                "value": "Number of kernel executions in hot time measurements."
               },
               "value": {
                 "type": "int64",
-                "value": 588
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0005411481292517009
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02452569809177557
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0005339167867185301
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.023483747081537745
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 490982876.9594332
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7855726031.350931
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.061357520239869186
+                "value": "1156"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=I64 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 8,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 8388608
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 182
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.001794480219780219
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01043393384097355
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.001785578369439303
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.009622529095864946
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 587247257.217429
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9395956115.478865
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0733875602621131
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I64 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 8,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 47
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007055212765957448
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.022872858397093812
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007045798778533936
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02290392030641084
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 595291482.4616572
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9524663719.386515
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07439283709843254
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I64 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 8,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 134217728
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 12
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.027424558333333335
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0056934303158599825
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.027411389350891106
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.005639627987530259
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 612052741.4804167
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9792843863.686666
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07648747081734775
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I64 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 8,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 3
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11087960000000001
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11086710357666014
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 605309075.7764491
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9684945212.423185
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07564472329123333
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I64 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 8,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=I64 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 8,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=F32 Input=Rand Pattern=Ascend Elements=2^16": {
-          "device": 0,
+        "Device=1 In=I16 Out=F32": {
+          "device": 1,
           "type_config_index": 9,
           "min_samples": 10,
           "min_time": 0.5,
@@ -12447,24 +15808,26 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
               "type": "string",
               "value": "F32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -12472,11 +15835,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 262144
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
               }
             },
             "Number of Samples (Cold)": {
@@ -12494,7 +15871,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 2324
+                "value": "1046"
               }
             },
             "Average CPU Time (Cold)": {
@@ -12512,7 +15889,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 8.308123924268518e-05
+                "value": "0.0004599197934990448"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -12530,7 +15907,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.20123216459376678
+                "value": "0.007608321516935087"
               }
             },
             "Average GPU Time (Cold)": {
@@ -12548,7 +15925,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 7.645772117159609e-05
+                "value": "0.00045521636728916755"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -12566,7 +15943,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.2163426509558451
+                "value": "0.007662230748478094"
               }
             },
             "Element Throughput": {
@@ -12584,7 +15961,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 857153456.7832047
+                "value": "73710952441.8422"
               }
             },
             "Average Global Memory Throughput": {
@@ -12602,7 +15979,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 6857227654.265637
+                "value": "442265714651.0532"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -12620,1172 +15997,50 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.05355870137360689
+                "value": "0.604056100648838"
               }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=F32 Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 9,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "F32"
             },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
+            "Average GPU Time (Batch)": {
               "hint": {
                 "type": "string",
-                "value": "bytes"
+                "value": "duration"
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
               },
               "value": {
-                "type": "int64",
-                "value": 1048576
+                "type": "float64",
+                "value": "0.0004532274742649026"
               }
             },
-            "Number of Samples (Cold)": {
+            "Number of Samples (Batch)": {
               "hint": {
                 "type": "string",
                 "value": "sample_size"
               },
               "short_name": {
                 "type": "string",
-                "value": "Samples"
+                "value": "Batch"
               },
               "description": {
                 "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
+                "value": "Number of kernel executions in hot time measurements."
               },
               "value": {
                 "type": "int64",
-                "value": 1081
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00018608593894542115
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.061054319816048364
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00017904156912376654
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.058580426390462105
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1464151600.563705
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 11713212804.50964
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.09148660338438547
+                "value": "1168"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=F32 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 9,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "F32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 361
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0004975271468144047
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.014379246069577772
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0004902403537091129
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.012272733824773483
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2138901851.0340319
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17111214808.272255
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1336479537012017
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=F32 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 9,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "F32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 96
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.001751761458333334
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.005879019262093316
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.001744055998822054
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.005818847781913598
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2404913605.315918
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 19239308842.527344
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.15026953294900763
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=F32 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 9,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "F32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 24
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006750754166666667
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.002444731344335298
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0067418733040491745
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0023841178656418723
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2488509534.8682375
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 19908076278.9459
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.15549297268609333
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=F32 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 9,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "F32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 6
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.026754200000000006
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0006428245182255163
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.026743643124898273
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0006056040012260031
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2509338899.2138395
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 20074711193.710716
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1567944825802199
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=F32 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 9,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "F32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10674365000000001
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10673254394531251
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2515029119.3053603
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 20120232954.442883
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1571500324484729
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=F32 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 9,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "F32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=F64 Input=Rand Pattern=Ascend Elements=2^16": {
-          "device": 0,
+        "Device=1 In=I16 Out=I64": {
+          "device": 1,
           "type_config_index": 10,
           "min_samples": 10,
           "min_time": 0.5,
@@ -13793,776 +16048,2516 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "648"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007539361157407405"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005745552244274178"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007492513590388824"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005767470540109363"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44783945461.29704"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "447839454612.97046"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6116688355181524"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007466272232380319"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "705"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I16 Out=F64": {
+          "device": 1,
+          "type_config_index": 11,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "650"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007517909569230775"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0052243182117119895"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007470858345581929"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005225121011834867"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44913757493.26477"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "449137574932.64764"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6134418363918374"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007439345558090966"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "707"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I32 Out=I8": {
+          "device": 1,
+          "type_config_index": 12,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I32 Out=I16": {
+          "device": 1,
+          "type_config_index": 13,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I32 Out=I32": {
+          "device": 1,
+          "type_config_index": 14,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=1 In=I32 Out=F32": {
+          "device": 1,
+          "type_config_index": 15,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1687"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002777349045643155"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005691592315009916"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000273079350458212"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005675718906016491"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "61437146279.45599"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "491497170235.64795"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6712974899416083"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002715061958589702"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1930"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I32 Out=I64": {
+          "device": 1,
+          "type_config_index": 16,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1133"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004230005507502205"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004752057993974487"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041831812217818477"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004789005696384007"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "40106357125.149025"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "481276285501.78827"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6573375840004757"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041601362464715726"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1251"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I32 Out=F64": {
+          "device": 1,
+          "type_config_index": 17,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1132"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004233390768551238"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004579872590098746"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041865752666346193"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004575612011474384"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "40073843013.66299"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "480886116163.9559"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6568046822606478"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004163219633556548"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1260"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=F32 Out=I8": {
+          "device": 1,
+          "type_config_index": 18,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=F32 Out=I16": {
+          "device": 1,
+          "type_config_index": 19,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=F32 Out=I32": {
+          "device": 1,
+          "type_config_index": 20,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1665"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00028157421321321324"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.01259188984622349"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002768973456309726"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0128083650869227"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "60590021048.303505"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "484720168386.42804"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6620413139019177"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002751490314863719"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1917"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=F32 Out=F32": {
+          "device": 1,
+          "type_config_index": 21,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=1 In=F32 Out=I64": {
+          "device": 1,
+          "type_config_index": 22,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1133"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004230867334510152"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004737837167796919"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004184002545904713"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0047584325554732645"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "40098484204.84705"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "481181810458.1647"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6572085479378342"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004160796998517786"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1265"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=F32 Out=F64": {
+          "device": 1,
+          "type_config_index": 23,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1132"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004234168127208481"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004661976745113187"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041872537112383403"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0046895661377312735"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "40067350003.108116"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "480808200037.2974"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6566982627257668"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041628507170249205"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1259"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I64 Out=I8": {
+          "device": 1,
+          "type_config_index": 24,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I64 Out=I16": {
+          "device": 1,
+          "type_config_index": 25,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I64 Out=I32": {
+          "device": 1,
+          "type_config_index": 26,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I64 Out=F32": {
+          "device": 1,
+          "type_config_index": 27,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I64 Out=I64": {
+          "device": 1,
+          "type_config_index": 28,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=1 In=I64 Out=F64": {
+          "device": 1,
+          "type_config_index": 29,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "8388608"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1753"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002666010844266969"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004049936615976281"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00026192253768546044"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00396300201275967"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "32027056831.87056"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "512432909309.92896"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6998919762209476"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00026010225147830515"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1995"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=F64 Out=I8": {
+          "device": 1,
+          "type_config_index": 30,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
               "type": "string",
               "value": "F64"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
+              "value": "I8"
             }
           },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 524288
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1446
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00017735048409405266
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11011368701446163
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00017031459973024972
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1112846952116118
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 384793788.1062353
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6156700609.699765
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.048087201712851205
-              }
-            }
-          },
-          "is_skipped": false
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
         },
-        "Device=0 Key=F64 Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 10,
+        "Device=1 In=F64 Out=I16": {
+          "device": 1,
+          "type_config_index": 31,
           "min_samples": 10,
           "min_time": 0.5,
           "max_noise": 0.005,
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
               "value": "F64"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
+              "value": "I16"
             }
           },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 2097152
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 614
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0005115635179153098
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.014363302118375774
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.000504564481760081
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.013743392821332084
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 519545091.8097892
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8312721468.956627
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06492690475003614
-              }
-            }
-          },
-          "is_skipped": false
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
         },
-        "Device=0 Key=F64 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 10,
+        "Device=1 In=F64 Out=I32": {
+          "device": 1,
+          "type_config_index": 32,
           "min_samples": 10,
           "min_time": 0.5,
           "max_noise": 0.005,
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
               "value": "F64"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
+              "value": "I32"
             }
           },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 8388608
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 187
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0017560828877005348
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007271133357282276
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0017482294819571758
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007054498367435898
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 599793111.1572946
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9596689778.516714
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07495540004465066
-              }
-            }
-          },
-          "is_skipped": false
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
         },
-        "Device=0 Key=F64 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 10,
+        "Device=1 In=F64 Out=F32": {
+          "device": 1,
+          "type_config_index": 33,
           "min_samples": 10,
           "min_time": 0.5,
           "max_noise": 0.005,
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
               "value": "F64"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
+              "value": "F32"
             }
           },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 50
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0067117
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0020046252309587377
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006701972465515136
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.001795830890697078
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 625831279.0125157
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 10013300464.20025
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07820935753718017
-              }
-            }
-          },
-          "is_skipped": false
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
         },
-        "Device=0 Key=F64 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 10,
+        "Device=1 In=F64 Out=I64": {
+          "device": 1,
+          "type_config_index": 34,
           "min_samples": 10,
           "min_time": 0.5,
           "max_noise": 0.005,
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
               "value": "F64"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
+              "value": "I64"
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "8388608"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -14570,11 +18565,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 134217728
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
               }
             },
             "Number of Samples (Cold)": {
@@ -14592,7 +18601,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 13
+                "value": "1753"
               }
             },
             "Average CPU Time (Cold)": {
@@ -14610,7 +18619,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.02650104615384615
+                "value": "0.00026658457387335985"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -14628,7 +18637,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.0007265940345306042
+                "value": "0.004159876144452023"
               }
             },
             "Average GPU Time (Cold)": {
@@ -14646,7 +18655,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.026490544979388894
+                "value": "0.0002619018185261111"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -14664,7 +18673,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.00072958672729313
+                "value": "0.004108154173032601"
               }
             },
             "Element Throughput": {
@@ -14682,7 +18691,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 633328457.8725579
+                "value": "32029590505.3583"
               }
             },
             "Average Global Memory Throughput": {
@@ -14700,7 +18709,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 10133255325.960926
+                "value": "512473448085.7328"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -14718,8326 +18727,69 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.07914627066640313
+                "value": "0.6999473449597531"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002600505164606654"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2010"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=F64 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 10,
+        "Device=1 In=F64 Out=F64": {
+          "device": 1,
+          "type_config_index": 35,
           "min_samples": 10,
           "min_time": 0.5,
           "max_noise": 0.005,
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
               "value": "F64"
             },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 3
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10620103333333335
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10618826548258464
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 631980037.4835782
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 10111680599.737251
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07897776024538593
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=F64 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 10,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
+            "Out": {
               "type": "string",
               "value": "F64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
             }
           },
           "summaries": null,
           "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=F64 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 10,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "F64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        }
-      }
-    },
-    {
-      "index": 1,
-      "name": "cub::DeviceRadixSort::SortKeys - Constant Values",
-      "min_samples": 10,
-      "min_time": 0.5,
-      "max_noise": 0.005,
-      "skip_time": -1.0,
-      "timeout": 0.5,
-      "devices": [
-        0
-      ],
-      "axes": {
-        "Key": {
-          "type": "type",
-          "flags": "",
-          "values": [
-            {
-              "input_string": "U8",
-              "description": "uint8_t",
-              "is_active": true
-            },
-            {
-              "input_string": "U16",
-              "description": "uint16_t",
-              "is_active": true
-            },
-            {
-              "input_string": "U32",
-              "description": "uint32_t",
-              "is_active": true
-            },
-            {
-              "input_string": "U64",
-              "description": "uint64_t",
-              "is_active": true
-            }
-          ]
-        },
-        "Input": {
-          "type": "type",
-          "flags": "",
-          "values": [
-            {
-              "input_string": "Const",
-              "description": "All values = 42",
-              "is_active": true
-            }
-          ]
-        },
-        "Pattern": {
-          "type": "type",
-          "flags": "",
-          "values": [
-            {
-              "input_string": "Ascend",
-              "description": "",
-              "is_active": true
-            }
-          ]
-        },
-        "Elements": {
-          "type": "int64",
-          "flags": "pow2",
-          "values": [
-            {
-              "input_string": "20",
-              "description": "2^20 = 1048576",
-              "value": 1048576
-            },
-            {
-              "input_string": "22",
-              "description": "2^22 = 4194304",
-              "value": 4194304
-            },
-            {
-              "input_string": "24",
-              "description": "2^24 = 16777216",
-              "value": 16777216
-            },
-            {
-              "input_string": "26",
-              "description": "2^26 = 67108864",
-              "value": 67108864
-            },
-            {
-              "input_string": "28",
-              "description": "2^28 = 268435456",
-              "value": 268435456
-            },
-            {
-              "input_string": "30",
-              "description": "2^30 = 1073741824",
-              "value": 1073741824
-            }
-          ]
-        }
-      },
-      "states": {
-        "Device=0 Key=U8 Input=Const Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1048576
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2566
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9.6390140296181e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.045598791758050095
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9.036228858044716e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.033692751648895305
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 11604132835.419285
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 23208265670.83857
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1812692582388666
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Const Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1160
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0003043432758620684
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.014476575112824767
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00029814910513573545
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.011313650678305812
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14067806771.013115
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 28135613542.02623
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.21975454216153953
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Const Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 363
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.001116191460055097
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.011463805762636945
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0011063706461063093
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.008802868537347153
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15164191185.878504
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 30328382371.757008
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.23688126696261097
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Const Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 100
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004274347
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0032816652112393967
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004266441283226011
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0032308789304643024
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15729470897.40716
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 31458941794.81432
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.24571155488326604
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Const Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 26
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.017081826923076922
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.010556997792740205
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.017073184013366702
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.010548301451256467
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15722635906.099308
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 31445271812.198616
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.24560478483659254
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Const Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 7
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06782757142857143
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.002248092939308053
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06781701987130302
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.002251004801379183
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15832925510.994875
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 31665851021.98975
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.24732762920199441
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Const Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 2097152
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1374
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00025053580786026195
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06364830482823801
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0002446616114899394
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06504028362673961
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4285821521.4654465
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17143286085.861786
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13389844793381175
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Const Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 8388608
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 484
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.000845314876033058
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0042877181599594
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0008389603308409695
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004147125806742644
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4999406820.338754
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 19997627281.355015
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1561924150318281
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Const Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 138
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.003167936956521741
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.009915660724473297
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0031528834553732395
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.001304193155702654
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 5321229356.387328
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 21284917425.549313
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1662468556731857
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Const Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 134217728
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 37
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.012415567567567568
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0020929194181039177
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.012407164470569507
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0020887980997006564
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 5408880019.216801
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 21635520076.867203
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1689852542869533
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Const Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 10
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04940072
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.000576076970718073
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04938988418579101
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0005818752831350183
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 5435029063.648347
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 21740116254.593388
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.16980220768708906
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Const Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=U32 Input=Const Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 804
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00047247549751243796
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.012916467373333794
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0004640141693291389
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.005379896492658131
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2259793060.8800316
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 18078344487.040253
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.14120176586353608
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Const Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 251
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.001718760956175299
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.008648709803676304
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0017113251937813012
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.008654677513055743
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2450909982.0662203
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 19607279856.529762
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.15314358798214323
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Const Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 68
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006638683823529413
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.002767845306863416
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0066234131490483005
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0015437824831612992
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2533016682.2540236
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 20264133458.03219
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.15827397414733962
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Const Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 18
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02638116111111112
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0035695573142047353
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02635982047186958
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0034495125905173608
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2545877126.576662
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 20367017012.613297
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.15907755102328555
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Const Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 5
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10501874
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0005592643361183458
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10500200805664064
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00055006156023586
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2556479261.3794527
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 20451834091.03562
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.15974001883150793
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Const Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=U64 Input=Const Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 8388608
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 254
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0017053511811023635
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.005201838970683627
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0016952332624300263
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0035478470341980604
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 618543785.8250388
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9896700573.20062
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07729864856598835
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Const Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 71
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006611254929577467
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.003663027154345941
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0066021115477655976
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0036695333152667172
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 635297354.4379919
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 10164757671.00787
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07939232122444287
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Const Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 134217728
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 19
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.026109178947368427
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0007116177577395647
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.026097561384502206
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0007337748011409093
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 642865275.9089972
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 10285844414.543955
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.08033807496988217
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Const Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 5
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10435436000000002
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0008827267294601751
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10434051666259767
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0009079744888650902
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 643171666.6403677
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 10290746666.245884
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.08037636423898623
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Const Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=U64 Input=Const Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        }
-      }
-    },
-    {
-      "index": 2,
-      "name": "cub::DeviceRadixSort::SortKeys - Half Word",
-      "min_samples": 10,
-      "min_time": 0.5,
-      "max_noise": 0.005,
-      "skip_time": -1.0,
-      "timeout": 0.5,
-      "devices": [
-        0
-      ],
-      "axes": {
-        "Key": {
-          "type": "type",
-          "flags": "",
-          "values": [
-            {
-              "input_string": "U8",
-              "description": "uint8_t",
-              "is_active": true
-            },
-            {
-              "input_string": "U16",
-              "description": "uint16_t",
-              "is_active": true
-            },
-            {
-              "input_string": "U32",
-              "description": "uint32_t",
-              "is_active": true
-            },
-            {
-              "input_string": "U64",
-              "description": "uint64_t",
-              "is_active": true
-            }
-          ]
-        },
-        "Input": {
-          "type": "type",
-          "flags": "",
-          "values": [
-            {
-              "input_string": "Rand",
-              "description": "Random values uniformly distributed across `T`'s value range",
-              "is_active": true
-            }
-          ]
-        },
-        "Pattern": {
-          "type": "type",
-          "flags": "",
-          "values": [
-            {
-              "input_string": "Ascend",
-              "description": "",
-              "is_active": true
-            }
-          ]
-        },
-        "Elements": {
-          "type": "int64",
-          "flags": "pow2",
-          "values": [
-            {
-              "input_string": "20",
-              "description": "2^20 = 1048576",
-              "value": 1048576
-            },
-            {
-              "input_string": "22",
-              "description": "2^22 = 4194304",
-              "value": 4194304
-            },
-            {
-              "input_string": "24",
-              "description": "2^24 = 16777216",
-              "value": 16777216
-            },
-            {
-              "input_string": "26",
-              "description": "2^26 = 67108864",
-              "value": 67108864
-            },
-            {
-              "input_string": "28",
-              "description": "2^28 = 268435456",
-              "value": 268435456
-            },
-            {
-              "input_string": "30",
-              "description": "2^30 = 1073741824",
-              "value": 1073741824
-            }
-          ]
-        },
-        "Bits": {
-          "type": "string",
-          "flags": "",
-          "values": [
-            {
-              "input_string": "Half",
-              "description": "",
-              "value": "Half"
-            }
-          ]
-        }
-      },
-      "states": {
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^20 Bits=Half": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1048576
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 538
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8.960966542750939e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.08397008166379398
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8.340330078477751e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.045910043356146846
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 12572356131.394053
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 25144712262.788105
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.19639396606151668
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^22 Bits=Half": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 131
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00028253587786259543
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04123287912799956
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0002716179535589143
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.037808745584459125
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15441924751.451488
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 30883849502.902977
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.24121976929910474
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^24 Bits=Half": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 33
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0009550272727272727
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007063558495008569
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0009488116340203719
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007360718544290043
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17682346419.92151
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 35364692839.84302
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.27621760840917126
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^26 Bits=Half": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 8
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.003773775
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.014705369350832631
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0037657760083675386
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.014874000832241253
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17820726418.90659
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 35641452837.81318
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.2783792554815451
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^28 Bits=Half": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.014729750000000002
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01471895980834961
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 18237393096.74077
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 36474786193.48154
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.2848880451252932
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^30 Bits=Half": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06453160000000001
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06452188873291016
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16641512594.97035
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 33283025189.9407
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.2599586446352529
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^20 Bits=Half": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 2097152
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 450
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00025646377777777785
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.021618265944340534
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00025038620548115814
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.022009562632867193
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4187834541.3837366
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16751338165.534946
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13083712013820722
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^22 Bits=Half": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 8388608
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 116
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0007816620689655175
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.020853227399617164
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0007727365504051081
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.016658278118279097
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 5427857654.463388
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 21711430617.853554
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.16957815716269023
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^24 Bits=Half": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 29
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.002729172413793104
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007205020379178172
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.002722046876775808
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007340614282676166
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6163455943.077721
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 24653823772.310883
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.19255985825661462
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^26 Bits=Half": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 134217728
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 7
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0105532
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.005970051554930724
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.010544246673583986
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.005939659771636435
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6364500573.390865
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 25458002293.56346
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1988409326852932
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^28 Bits=Half": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0433476
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04333521652221679
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6194395171.935518
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 24777580687.742073
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1935264675061084
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^30 Bits=Half": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^20 Bits=Half": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 403
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00035955682382134016
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04131883733262655
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.000348795553294956
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.031411058442745776
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3006276857.874047
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 24050214862.992374
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.18784534228155753
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^22 Bits=Half": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 103
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0012605281553398058
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0430687589837951
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0012512904081529784
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.03898592591253357
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3351982859.191884
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 26815862873.535072
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.20944656705772832
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^24 Bits=Half": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 26
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004762250000000001
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004982744499450167
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004748374150349544
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0038811646473003495
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3533254850.771389
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 28266038806.171112
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.22077323486449568
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^26 Bits=Half": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 7
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01901851428571429
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.010872125017202998
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.019009632383074078
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01087158253950949
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3530255748.646293
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 28242045989.170345
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.22058583783093558
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^28 Bits=Half": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07661000000000001
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07659774398803712
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3504482534.6543326
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 28035860277.23466
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.21897541456225522
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^30 Bits=Half": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^20 Bits=Half": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 8388608
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 270
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0009441937037037035
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.08572346375827458
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0009308705164326564
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.019541093452539494
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1126446677.0506625
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 18023146832.8106
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1407706419708401
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^22 Bits=Half": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 68
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0036803838235294115
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04345566028172002
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0036717261111035055
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04355542870677633
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1142324855.6901317
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 18277197691.042107
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.14275491823170852
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^24 Bits=Half": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 134217728
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 18
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01408933888888889
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0017506825580184378
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.014078581280178495
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0016365268643609497
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1191683712.024376
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 19066939392.390015
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1489232331947483
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^26 Bits=Half": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 5
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.05700572
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0126927638236131
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.05699455947875977
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.012703536689392532
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1177460877.2089825
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 18839374035.34372
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.147145823195324
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^28 Bits=Half": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^30 Bits=Half": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
+          "skip_reason": "Not a conversion: InputType == OutputType."
         }
       }
     }
diff --git a/scripts/test_ref.json b/scripts/test_ref.json
index 89b8d8f..557a9c7 100644
--- a/scripts/test_ref.json
+++ b/scripts/test_ref.json
@@ -2,21 +2,42 @@
   "devices": [
     {
       "id": 0,
-      "name": "NVIDIA GeForce GTX 1650",
-      "sm_version": 750,
-      "ptx_version": 750,
-      "sm_default_clock_rate": 1560000000,
-      "number_of_sms": 16,
-      "max_blocks_per_sm": 16,
-      "max_threads_per_sm": 1024,
+      "name": "NVIDIA Quadro GV100",
+      "sm_version": 700,
+      "ptx_version": 700,
+      "sm_default_clock_rate": 1627000000,
+      "number_of_sms": 80,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
       "max_threads_per_block": 1024,
       "registers_per_sm": 65536,
       "registers_per_block": 65536,
-      "global_memory_size": 4294967296,
-      "global_memory_bus_peak_clock_rate": 4001000000,
-      "global_memory_bus_width": 128,
-      "global_memory_bus_bandwidth": 128032000000,
-      "l2_cache_size": 1048576,
+      "global_memory_size": 34078982144,
+      "global_memory_bus_peak_clock_rate": 850000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 870400000000,
+      "l2_cache_size": 6291456,
+      "shared_memory_per_sm": 98304,
+      "shared_memory_per_block": 49152,
+      "ecc_state": false
+    },
+    {
+      "id": 1,
+      "name": "NVIDIA Quadro GP100",
+      "sm_version": 600,
+      "ptx_version": 600,
+      "sm_default_clock_rate": 1442500000,
+      "number_of_sms": 56,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
+      "max_threads_per_block": 1024,
+      "registers_per_sm": 65536,
+      "registers_per_block": 65536,
+      "global_memory_size": 17069309952,
+      "global_memory_bus_peak_clock_rate": 715000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 732160000000,
+      "l2_cache_size": 4194304,
       "shared_memory_per_sm": 65536,
       "shared_memory_per_block": 49152,
       "ecc_state": false
@@ -25,25 +46,7256 @@
   "benchmarks": [
     {
       "index": 0,
-      "name": "cub::DeviceRadixSort::SortKeys - Overview",
+      "name": "simple",
       "min_samples": 10,
       "min_time": 0.5,
       "max_noise": 0.005,
       "skip_time": -1.0,
       "timeout": 0.5,
       "devices": [
-        0
+        0,
+        1
+      ],
+      "axes": null,
+      "states": {
+        "Device=0": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": null,
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "486"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010094132736625523"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005987183296179167"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010034002306039446"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005072701393681687"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001473929135854"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": null,
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "488"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010074898913934418"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005542305355933818"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010027081287298028"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00035037919649082367"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014748609703007"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    },
+    {
+      "index": 1,
+      "name": "single_float64_axis",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 0.5,
+      "devices": [
+        0,
+        1
       ],
       "axes": {
-        "Key": {
-          "type": "type",
+        "Duration": {
+          "type": "float64",
           "flags": "",
           "values": [
             {
-              "input_string": "bool",
+              "input_string": "0",
               "description": "",
-              "is_active": true
+              "value": 0.0
             },
+            {
+              "input_string": "0.0001",
+              "description": "",
+              "value": 0.0001
+            },
+            {
+              "input_string": "0.0002",
+              "description": "",
+              "value": 0.0002
+            },
+            {
+              "input_string": "0.0003",
+              "description": "",
+              "value": 0.00030000000000000003
+            },
+            {
+              "input_string": "0.0004",
+              "description": "",
+              "value": 0.0004
+            },
+            {
+              "input_string": "0.0005",
+              "description": "",
+              "value": 0.0005
+            },
+            {
+              "input_string": "0.0006",
+              "description": "",
+              "value": 0.0006000000000000001
+            },
+            {
+              "input_string": "0.0007",
+              "description": "",
+              "value": 0.0007000000000000001
+            },
+            {
+              "input_string": "0.0008",
+              "description": "",
+              "value": 0.0008000000000000001
+            },
+            {
+              "input_string": "0.0009",
+              "description": "",
+              "value": 0.0009000000000000002
+            },
+            {
+              "input_string": "0.001",
+              "description": "",
+              "value": 0.0010000000000000002
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 Duration=0": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "14061"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "9.102689638005845e-06"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.033946388108068055"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "3.7547417902904438e-06"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.12549022159970946"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "1.630773172830879e-06"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "306655"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0001": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "3835"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00010860168552803123"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004007949999262656"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00010303751935470811"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004789691009751296"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00010137620362095862"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "5088"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0002": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2174"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00020898149126034966"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002070700973146156"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00020338884861017417"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002418204625044133"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002017283984223771"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2583"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0003": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00030000000000000003"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1520"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00030825112500000015"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0014009307905580174"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00030272901975793895"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0016163896900565434"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003010571695496376"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1742"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0004": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0004"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1166"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004085718481989706"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010690404823574895"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00040307120334734023"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0012226190019077351"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004014095938278854"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1304"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0005": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0005"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "945"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005089798201058188"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008530028319072816"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005034217145707861"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009752402596440034"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005017619516657686"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1044"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0006": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0006000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "796"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006082355979899511"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007134353357638104"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006027260286424639"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008279817736951732"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006010891975612815"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "872"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0007": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0007000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "685"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007086865854014601"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006295331091145095"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007030571342384726"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007151653876403053"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007014426981064088"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "748"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0008": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0008000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "602"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008090872425249167"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005562631850494214"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008034305715085621"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006219681072125149"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008017951428707951"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "654"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0009": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0009000000000000002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "538"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009084568382899636"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005031047519089767"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009027937730448745"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005501738587938111"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009011217884181701"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "582"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.001": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0010000000000000002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "487"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010086481827515403"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004266615566594544"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010031193825253714"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004975122529595318"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001475909284053"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "15089"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "8.108349592418312e-06"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.05461449121054022"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "3.271210544150035e-06"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.059765735669007766"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "1.3421442998656208e-06"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "372558"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0001": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "3944"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00010710262145030443"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004128650771669589"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00010247565930403145"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0030818570098060543"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00010137613820964433"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "5117"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0002": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2193"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00020765215686274505"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0021648763590408093"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00020284258628946086"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0015447061481155045"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002017285137353667"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2584"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0003": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00030000000000000003"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1537"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003068213201040992"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0013512096196898148"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00030219575751114794"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010543163243715088"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003010567871656286"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1736"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0004": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0004"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1176"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00040721289880952437"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010016437258221326"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004025331704186726"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007456691947680211"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004014084236753499"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1304"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0005": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0005"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "951"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005075412103049417"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000846863074833117"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005028813449366248"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006292766848433991"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005017613753177333"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1045"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0006": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0006000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "800"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006068351487499997"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006561812659454387"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006021752006560568"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000511717182892197"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006010895299747637"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "873"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0007": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0007000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "690"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007071279246376804"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005386426703062701"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007025530446266783"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00042821786377290075"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007014415557371741"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "748"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0008": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0008000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "605"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008076996363636364"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006014433173443102"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008029008409208492"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00036509958633429017"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008017936496559632"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "654"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0009": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0009000000000000002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "540"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009070510574074071"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00046472458647248545"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009022252441556363"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00033895812399517745"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009011227322607926"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "582"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.001": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0010000000000000002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "488"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010073550901639342"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004238073408932392"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025966528986322"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003136332645329908"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001473929135854"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    },
+    {
+      "index": 2,
+      "name": "copy_sweep_grid_shape",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 0.5,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "BlockSize": {
+          "type": "int64",
+          "flags": "pow2",
+          "values": [
+            {
+              "input_string": "6",
+              "description": "2^6 = 64",
+              "value": 64
+            },
+            {
+              "input_string": "8",
+              "description": "2^8 = 256",
+              "value": 256
+            },
+            {
+              "input_string": "10",
+              "description": "2^10 = 1024",
+              "value": 1024
+            }
+          ]
+        },
+        "NumBlocks": {
+          "type": "int64",
+          "flags": "pow2",
+          "values": [
+            {
+              "input_string": "6",
+              "description": "2^6 = 64",
+              "value": 64
+            },
+            {
+              "input_string": "8",
+              "description": "2^8 = 256",
+              "value": 256
+            },
+            {
+              "input_string": "10",
+              "description": "2^10 = 1024",
+              "value": 1024
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 BlockSize=2^6 NumBlocks=2^6": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "71"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007065658352112677"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.05358128799632556"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007059958081849862"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.053589324741995806"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "9505561254.326319"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "76044490034.61055"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.08736729094049925"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006475561071325232"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "81"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^8 NumBlocks=2^6": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "229"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0021687765283842793"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006699637202043051"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0021633964730141996"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00669331351204079"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "31020141170.19388"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "248161129361.55103"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.28511159163781136"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002160161503025743"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "244"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^10 NumBlocks=2^6": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "448"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010963011227678571"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.013516109455086892"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001090899714667882"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.013565950821979889"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "61516987398.26961"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "492135899186.15686"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5654134871164486"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010871857387360318"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "481"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^6 NumBlocks=2^8": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "229"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002169116519650655"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.003880325099879575"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0021636720515755057"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0038733421374846436"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "31016190254.495274"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "248129522035.9622"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.2850752780744051"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0021606314702289093"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "243"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^8 NumBlocks=2^8": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "456"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010761263311403508"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.011961974879208899"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001070721754902288"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.012050980053815875"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "62676286993.08928"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "501410295944.71423"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5760688142747177"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010696770163143381"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "493"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^10 NumBlocks=2^8": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "500"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000980373466000001"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005650663121151804"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009750024316310896"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005680157515531913"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "68829432443.29456"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "550635459546.3564"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6326234599567514"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009731230225510264"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "542"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^6 NumBlocks=2^10": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "459"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010701848496732027"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008617668166839768"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010647455503218568"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008515610201608317"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "63028076501.20161"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "504224612009.61285"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5793021737242795"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010672177234327936"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "498"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^8 NumBlocks=2^10": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "500"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000979696614"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0074291976714003565"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009743501433134098"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0074775515242700395"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "68875510986.00674"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "551004087888.054"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6330469759743267"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009717721991970888"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "541"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^10 NumBlocks=2^10": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "475"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010337088463157895"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.021637984186463816"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010282407758110449"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.02173209936637211"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "65265709723.54853"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "522125677788.38824"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.599868655547321"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010291563019039125"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "508"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^6 NumBlocks=2^6": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "76"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006647754513157893"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011430629751785044"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006643085875009235"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011389249175732911"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "10102061792.16473"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "80816494337.31784"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.11038091993186987"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006639652300484573"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "79"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^8 NumBlocks=2^6": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "216"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002300918597222223"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0022260554559899452"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0022963019234162794"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0022413389898784455"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "29224756255.11826"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "233798050040.94608"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.3193264450952607"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0022975726211280152"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "228"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^10 NumBlocks=2^6": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "418"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011795720191387577"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0035334409960244696"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001174919423874485"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0035420884521558988"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "57117843688.972115"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "456942749511.7769"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6241023130350974"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011729015622820172"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "448"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^6 NumBlocks=2^8": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "224"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0022223120000000006"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0014441799301084402"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00221759328778301"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001434325968668793"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "30262025218.83109"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "242096201750.6487"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.3306602405903747"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002216961359573623"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "236"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^8 NumBlocks=2^8": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "435"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011336455977011492"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006534400600481561"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001129045183631195"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0065638034102788135"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "59438599068.433075"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "475508792547.4646"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6494602170938929"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011272204485062364"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "466"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^10 NumBlocks=2^8": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "437"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011265385652173912"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002220966435104119"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011218978122933775"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0022003475082832675"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "59817269687.70571"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "478538157501.6457"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6535977894198614"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001119863004765959"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "468"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^6 NumBlocks=2^10": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "439"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011232369088838266"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00285184985884414"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011185731920403065"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0028360480110887457"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "59995058416.86738"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "479960467334.939"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6555404110234635"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011161975045489451"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "468"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^8 NumBlocks=2^10": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "440"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011206702840909095"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002536479032620614"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011160453837026254"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0025536971451898373"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "60130945371.914566"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "481047562975.3165"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6570251898155001"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011138856279089096"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "470"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^10 NumBlocks=2^10": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "464"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010597870474137931"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0020209648798997564"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010551077248207455"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002011195776784625"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "63603803120.10441"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "508830424960.83527"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6949716250011408"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010536742918941392"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "498"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    },
+    {
+      "index": 3,
+      "name": "copy_type_sweep",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 0.5,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "T": {
+          "type": "type",
+          "flags": "",
+          "values": [
             {
               "input_string": "U8",
               "description": "uint8_t",
@@ -64,6 +7316,2415 @@
               "description": "uint64_t",
               "is_active": true
             },
+            {
+              "input_string": "F32",
+              "description": "float",
+              "is_active": true
+            },
+            {
+              "input_string": "F64",
+              "description": "double",
+              "is_active": true
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 T=U8": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U8"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "217"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002284935774193548"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.003019023225421965"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0022794654072704396"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0030185067855524154"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "117762460945.3669"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "235524921890.7338"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.2705938900399056"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0022792820785984846"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "231"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=U16": {
+          "device": 0,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U16"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "341"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0014459254017595295"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005620271181121053"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0014404413371491634"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005659383776137258"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "93178197916.5051"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "372712791666.0204"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.4282086301309977"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0014370339589576198"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "365"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=U32": {
+          "device": 0,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "456"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010763392214912279"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009580925422442722"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010708663173412028"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009602261983780735"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "62667825958.53892"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "501342607668.31134"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5759910474130415"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010690977880559816"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "489"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=U64": {
+          "device": 0,
+          "type_config_index": 3,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "514"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009534325642023344"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007974682202520992"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009479809484593146"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008022855237026269"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "35395681795.64538"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "566330908730.326"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6506559153611283"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009457213474094653"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "554"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=F32": {
+          "device": 0,
+          "type_config_index": 4,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "456"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010769479144736836"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.011261863999383217"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001071445541946512"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.011287071608158339"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "62633947664.836296"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "501071581318.69037"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5756796660370983"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001069358981385523"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "490"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=F64": {
+          "device": 0,
+          "type_config_index": 5,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "514"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009534943599221791"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006006780711077088"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009480226613900089"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00602313677626831"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "35394124388.125755"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "566305990210.0121"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6506272865464293"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009457029259723165"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "552"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=U8": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U8"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "184"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00270240325"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0033226300614619185"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002697714079981265"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0033217171224860604"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "99504783695.18842"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "199009567390.37683"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.2718115813351956"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0026982716095753207"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "195"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=U16": {
+          "device": 1,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U16"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "325"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0015216281538461547"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0046556036312148845"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0015169812690294725"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004682337277211795"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "88476852509.76712"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "353907410039.0685"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.4833744127500389"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0015158526066057275"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "347"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=U32": {
+          "device": 1,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "435"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011331533540229887"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006418753103730108"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011284679349811587"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0064621372230947265"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "59469003876.588196"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "475752031012.70557"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6497924374627206"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011265910963430138"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "467"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=U64": {
+          "device": 1,
+          "type_config_index": 3,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "468"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010515641474358975"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002762541639974713"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001046885606570122"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002764528097772722"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "32051670009.99595"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "512826720159.9352"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.7004298516170443"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001044835600653889"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "503"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=F32": {
+          "device": 1,
+          "type_config_index": 4,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "435"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011328659609195397"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006308260028809877"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011281658846756504"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006329740046854081"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "59484925853.163795"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "475879406825.31036"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6499664101088701"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011261699270694815"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "470"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=F64": {
+          "device": 1,
+          "type_config_index": 5,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "468"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010518281880341881"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002638709647720786"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010471613009770718"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0026399350413532966"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "32043231514.27718"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "512691704228.4349"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.7002454439308824"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010447449703140563"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "502"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    },
+    {
+      "index": 4,
+      "name": "copy_type_conversion_sweep",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 0.5,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "In": {
+          "type": "type",
+          "flags": "",
+          "values": [
             {
               "input_string": "I8",
               "description": "int8_t",
@@ -80,13 +9741,13 @@
               "is_active": true
             },
             {
-              "input_string": "I64",
-              "description": "int64_t",
+              "input_string": "F32",
+              "description": "float",
               "is_active": true
             },
             {
-              "input_string": "F32",
-              "description": "float",
+              "input_string": "I64",
+              "description": "int64_t",
               "is_active": true
             },
             {
@@ -96,77 +9757,45 @@
             }
           ]
         },
-        "Input": {
+        "Out": {
           "type": "type",
           "flags": "",
           "values": [
             {
-              "input_string": "Rand",
-              "description": "Random values uniformly distributed across `T`'s value range",
+              "input_string": "I8",
+              "description": "int8_t",
               "is_active": true
-            }
-          ]
-        },
-        "Pattern": {
-          "type": "type",
-          "flags": "",
-          "values": [
+            },
             {
-              "input_string": "Ascend",
-              "description": "",
+              "input_string": "I16",
+              "description": "int16_t",
               "is_active": true
-            }
-          ]
-        },
-        "Elements": {
-          "type": "int64",
-          "flags": "pow2",
-          "values": [
-            {
-              "input_string": "16",
-              "description": "2^16 = 65536",
-              "value": 65536
             },
             {
-              "input_string": "18",
-              "description": "2^18 = 262144",
-              "value": 262144
+              "input_string": "I32",
+              "description": "int32_t",
+              "is_active": true
             },
             {
-              "input_string": "20",
-              "description": "2^20 = 1048576",
-              "value": 1048576
+              "input_string": "F32",
+              "description": "float",
+              "is_active": true
             },
             {
-              "input_string": "22",
-              "description": "2^22 = 4194304",
-              "value": 4194304
+              "input_string": "I64",
+              "description": "int64_t",
+              "is_active": true
             },
             {
-              "input_string": "24",
-              "description": "2^24 = 16777216",
-              "value": 16777216
-            },
-            {
-              "input_string": "26",
-              "description": "2^26 = 67108864",
-              "value": 67108864
-            },
-            {
-              "input_string": "28",
-              "description": "2^28 = 268435456",
-              "value": 268435456
-            },
-            {
-              "input_string": "30",
-              "description": "2^30 = 1073741824",
-              "value": 1073741824
+              "input_string": "F64",
+              "description": "double",
+              "is_active": true
             }
           ]
         }
       },
       "states": {
-        "Device=0 Key=bool Input=Rand Pattern=Ascend Elements=2^16": {
+        "Device=0 In=I8 Out=I8": {
           "device": 0,
           "type_config_index": 0,
           "min_samples": 10,
@@ -175,4378 +9804,48 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
-              "value": "bool"
+              "value": "I8"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 65536
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2609
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4.044829436565725e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.7344469005707992
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3.3346968211430716e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.7223623748539709
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1965276110.993967
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3930552221.987934
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.030699764293207435
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=bool Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "bool"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 262144
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1611
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4.803054003724396e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12230911675278
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4.140059072073415e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10760267299529298
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6331890329.011988
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 12663780658.023975
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.09891105862615576
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=bool Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "bool"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1048576
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 508
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00010131082677165365
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.022658862636519568
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9.499697684948347e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02014810246126259
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 11037993363.319344
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 22075986726.638687
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.17242553991688553
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=bool Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "bool"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 134
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0003098910447761196
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02924359345479745
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00030219128372064273
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.010038496680263341
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 13879632623.280346
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 27759265246.56069
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.21681505597476172
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=bool Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "bool"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 32
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0011546593749999997
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1716310430725688
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0011058050058782103
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006801724834036127
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15171947957.204117
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 30343895914.408234
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.23700243622225875
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=bool Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "bool"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 8
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004268425
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.003888230871894963
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0042546199560165405
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0018586331551975715
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15773174735.642382
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 31546349471.284763
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.24639425668024215
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=bool Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "bool"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01962395
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.019611552238464357
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 13687619049.017168
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 27375238098.034336
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.21381559374245765
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=bool Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "bool"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0675028
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06749183654785157
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15909210341.886597
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 31818420683.773193
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.24851928177153518
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^16": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 65536
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2970
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 5.3952222222222165e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.32079964560195756
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4.756137399435661e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.19639288507532943
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1377924868.3559093
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2755849736.7118187
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0215246948943375
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 262144
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1528
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8.000445026177996e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.09211021423127838
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7.354155020466494e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06982489161891461
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3564569950.8707323
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7129139901.741465
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.05568248486113991
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1048576
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 474
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00017160316455696196
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.062041477145776486
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0001641129443917092
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04734404630253115
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6389355842.018351
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 12778711684.036701
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.09980873284832464
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 124
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0005410080645161291
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0214250036018518
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0005334214202819335
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.015317548261178274
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7863021319.584713
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15726042639.169426
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12282900086829407
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 30
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.001945613333333334
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007356225627552308
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0019374335924784336
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007393135530598775
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8659505061.300186
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17319010122.600372
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.135270948845604
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 8
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00753255
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.003915793151916625
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007524112045764923
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0038120757657901278
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8919173929.337402
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17838347858.674805
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13932726083068925
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.030915200000000004
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.030904863357543944
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8685864515.704916
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17371729031.409832
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13568271237979437
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1319179
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13190963745117187
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8139980101.131428
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16279960202.262856
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12715540023012104
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^16": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 131072
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2295
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8.879450980392147e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11816100044470679
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8.305174021094968e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12439270947006983
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 789098456.3783966
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3156393825.5135865
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.024653163470957154
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 524288
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1176
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00015293358843537406
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.044663112370144566
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00014689224612500005
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.03577765071007969
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1784600664.1965623
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7138402656.786249
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.05575483204813054
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 2097152
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 396
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00038702020202020234
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.029965184414531703
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00038085559544840265
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.029252982130513375
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2753211486.2733016
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 11012845945.093206
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.08601635485732634
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 8388608
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 106
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0011748773584905655
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.015819656338631416
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.001168463095179144
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.015910226957725492
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3589590477.7009206
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14358361910.803682
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11214666576171334
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 27
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004157185185185186
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006148449005215739
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004149304955093949
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.005989462876723687
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4043379838.689183
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16173519354.756733
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12632403894929964
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 134217728
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 7
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.015923899999999998
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0030102828263264473
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.015911899294172017
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0030381049232100386
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4217526943.787262
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16870107775.149048
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13176477579940207
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0675835
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06757376098632813
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3972480620.9071484
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15889922483.628593
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12410899215530956
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
+              "value": "I8"
             }
           },
           "summaries": null,
           "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
+          "skip_reason": "Not a conversion: InputType == OutputType."
         },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^16": {
+        "Device=0 In=I8 Out=I16": {
           "device": 0,
-          "type_config_index": 3,
+          "type_config_index": 1,
           "min_samples": 10,
           "min_time": 0.5,
           "max_noise": 0.005,
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
-              "value": "U32"
+              "value": "I8"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
+              "value": "I16"
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -4554,11 +9853,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 262144
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
               }
             },
             "Number of Samples (Cold)": {
@@ -4576,7 +9889,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 2075
+                "value": "775"
               }
             },
             "Average CPU Time (Cold)": {
@@ -4594,7 +9907,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 9.593363855421674e-05
+                "value": "0.0006248230980645156"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -4612,7 +9925,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.14775468420150914
+                "value": "0.0027640779893251216"
               }
             },
             "Average GPU Time (Cold)": {
@@ -4630,7 +9943,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 8.907498377992449e-05
+                "value": "0.0006193935315070645"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -4648,7 +9961,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.15871029125259384
+                "value": "0.0028186397219177456"
               }
             },
             "Element Throughput": {
@@ -4666,7 +9979,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 735739679.3011861
+                "value": "108346084655.93024"
               }
             },
             "Average Global Memory Throughput": {
@@ -4684,7 +9997,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 5885917434.409489
+                "value": "325038253967.7907"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -4702,39 +10015,77 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.04597223689709985
+                "value": "0.37343549398873016"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006171660299862132"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "850"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^18": {
+        "Device=0 In=I8 Out=I32": {
           "device": 0,
-          "type_config_index": 3,
+          "type_config_index": 2,
           "min_samples": 10,
           "min_time": 0.5,
           "max_noise": 0.005,
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
-              "value": "U32"
+              "value": "I8"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
+              "value": "I32"
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -4742,11 +10093,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 1048576
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
               }
             },
             "Number of Samples (Cold)": {
@@ -4764,7 +10129,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 955
+                "value": "660"
               }
             },
             "Average CPU Time (Cold)": {
@@ -4782,7 +10147,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.0002228811518324608
+                "value": "0.0007372658136363634"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -4800,7 +10165,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.025900335416445572
+                "value": "0.004348049843468552"
               }
             },
             "Average GPU Time (Cold)": {
@@ -4818,7 +10183,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.00021573310965642857
+                "value": "0.0007317814296845251"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -4836,7 +10201,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.02430793263473743
+                "value": "0.004351029775591727"
               }
             },
             "Element Throughput": {
@@ -4854,7 +10219,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 1215131049.737726
+                "value": "91706158803.36154"
               }
             },
             "Average Global Memory Throughput": {
@@ -4872,7 +10237,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 9721048397.901808
+                "value": "458530794016.8077"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -4890,13 +10255,49 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.07592670893137503
+                "value": "0.5268046806259279"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007299521218782687"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "717"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^20": {
+        "Device=0 In=I8 Out=F32": {
           "device": 0,
           "type_config_index": 3,
           "min_samples": 10,
@@ -4905,24 +10306,26 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
-              "value": "U32"
+              "value": "I8"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
+              "value": "F32"
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -4930,11 +10333,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 4194304
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
               }
             },
             "Number of Samples (Cold)": {
@@ -4952,7 +10369,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 323
+                "value": "656"
               }
             },
             "Average CPU Time (Cold)": {
@@ -4970,7 +10387,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.0006567151702786381
+                "value": "0.000742387521341463"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -4988,7 +10405,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.010434977776651313
+                "value": "0.0041525675601748364"
               }
             },
             "Average GPU Time (Cold)": {
@@ -5006,7 +10423,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.0006496768811538867
+                "value": "0.0007369443420775064"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -5024,7 +10441,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.010470007981899945
+                "value": "0.004193469264853706"
               }
             },
             "Element Throughput": {
@@ -5042,7 +10459,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 1613996173.2017174
+                "value": "91063680346.35373"
               }
             },
             "Average Global Memory Throughput": {
@@ -5060,7 +10477,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 12911969385.613739
+                "value": "455318401731.7686"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -5078,795 +10495,49 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.10084954843799784
+                "value": "0.5231139725778592"
               }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
             },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
+            "Average GPU Time (Batch)": {
               "hint": {
                 "type": "string",
-                "value": "bytes"
+                "value": "duration"
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
               },
               "value": {
-                "type": "int64",
-                "value": 16777216
+                "type": "float64",
+                "value": "0.0007352807822347689"
               }
             },
-            "Number of Samples (Cold)": {
+            "Number of Samples (Batch)": {
               "hint": {
                 "type": "string",
                 "value": "sample_size"
               },
               "short_name": {
                 "type": "string",
-                "value": "Samples"
+                "value": "Batch"
               },
               "description": {
                 "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
+                "value": "Number of kernel executions in hot time measurements."
               },
               "value": {
                 "type": "int64",
-                "value": 83
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0023719566265060247
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.028543008422411302
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0023625457344284987
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02881928350302021
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1775332404.7352695
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14202659237.882156
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11093054266028927
+                "value": "714"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 21
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.009017638095238097
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007349245214913964
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.009007660184587752
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007398716674765135
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1862549836.0502186
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14900398688.401749
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11638026968571723
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 5
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.03635022
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.011186633894477735
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0363376838684082
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01114331248801505
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1846811817.8094478
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14774494542.475582
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1153968893907428
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.14715240000000002
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.14714149475097654
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1824335524.4847984
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14594684195.878387
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11399247216225934
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^16": {
+        "Device=0 In=I8 Out=I64": {
           "device": 0,
           "type_config_index": 4,
           "min_samples": 10,
@@ -5875,24 +10546,26 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
-              "value": "U64"
+              "value": "I8"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
+              "value": "I64"
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -5900,11 +10573,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 524288
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "536870912"
               }
             },
             "Number of Samples (Cold)": {
@@ -5922,7 +10609,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 1308
+                "value": "407"
               }
             },
             "Average CPU Time (Cold)": {
@@ -5940,7 +10627,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.00021293440366972473
+                "value": "0.0012095483882063889"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -5958,7 +10645,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.06644665882181848
+                "value": "0.009732185124544102"
               }
             },
             "Average GPU Time (Cold)": {
@@ -5976,7 +10663,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.00020585504468154448
+                "value": "0.001204128551248836"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -5994,7 +10681,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.06352394541054457
+                "value": "0.009798212399727946"
               }
             },
             "Element Throughput": {
@@ -6012,7 +10699,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 318359941.5860004
+                "value": "55732308589.8092"
               }
             },
             "Average Global Memory Throughput": {
@@ -6030,7 +10717,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 5093759065.376006
+                "value": "501590777308.2828"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -6048,14 +10735,4308 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.03978504643664089
+                "value": "0.576276168782494"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0012017273091491842"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "429"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^18": {
+        "Device=0 In=I8 Out=F64": {
           "device": 0,
+          "type_config_index": 5,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "536870912"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "415"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011847366168674703"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.011261383409993239"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011792877487389432"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.011302242538631406"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "56906267424.351715"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "512156406819.16547"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5884149894521662"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011767830588600852"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "440"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I16 Out=I8": {
+          "device": 0,
+          "type_config_index": 6,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I16 Out=I16": {
+          "device": 0,
+          "type_config_index": 7,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=0 In=I16 Out=I32": {
+          "device": 0,
+          "type_config_index": 8,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1105"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00043142517375565617"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.01116818587784149"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00042600826737028365"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.011332580467569093"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "78764743715.25449"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "472588462291.5269"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5429554943606697"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00042359266142467694"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1238"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I16 Out=F32": {
+          "device": 0,
+          "type_config_index": 9,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1102"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00043289838384754937"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008465395678081931"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00042745939692221985"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008617999240612035"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "78497354933.81969"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "470984129602.9181"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5411122812533525"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00042536910129233627"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1229"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I16 Out=I64": {
+          "device": 0,
+          "type_config_index": 10,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "734"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006609588569482289"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007896476276327823"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006555628124472239"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007981909890800989"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "51184160179.466095"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "511841601794.66095"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5880533108854101"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006538430490801411"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "806"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I16 Out=F64": {
+          "device": 0,
+          "type_config_index": 11,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "734"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006605395899182562"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007740408518735753"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006550883051485072"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007833851008491804"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "51221234963.72489"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "512212349637.2489"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5884792619913246"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006534532250824923"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "805"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I32 Out=I8": {
+          "device": 0,
+          "type_config_index": 12,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I32 Out=I16": {
+          "device": 0,
+          "type_config_index": 13,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I32 Out=I32": {
+          "device": 0,
+          "type_config_index": 14,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=0 In=I32 Out=F32": {
+          "device": 0,
+          "type_config_index": 15,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1735"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00026702492853025945"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.01324576727299336"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00026161364844278195"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.013450268523907918"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "64129742847.37816"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "513037942779.02527"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5894277835236963"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00025957003988639885"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2015"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I32 Out=I64": {
+          "device": 0,
+          "type_config_index": 16,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1234"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003841953128038892"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008873245446388355"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003788044850192556"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008932234099031263"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44289908550.44172"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "531478902605.3006"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6106145480299869"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00037766468619885956"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1381"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I32 Out=F64": {
+          "device": 0,
+          "type_config_index": 17,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1235"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003840312064777327"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009389520289783196"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00037863498520754796"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009545097161422792"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44309735379.58624"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "531716824555.03485"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6108878958582662"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003773968978051128"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1396"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=F32 Out=I8": {
+          "device": 0,
+          "type_config_index": 18,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F32 Out=I16": {
+          "device": 0,
+          "type_config_index": 19,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F32 Out=I32": {
+          "device": 0,
+          "type_config_index": 20,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1726"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00026856249884125153"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.01342456387766187"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00026315643022814674"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.013724796519135959"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "63753775598.24316"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "510030204785.94525"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5859722021897349"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002609094005709575"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2047"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=F32 Out=F32": {
+          "device": 0,
+          "type_config_index": 21,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=0 In=F32 Out=I64": {
+          "device": 0,
+          "type_config_index": 22,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1235"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003840352834008098"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009209302867708775"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00037863381922486526"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009434239106344595"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44309871829.05669"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "531718461948.68024"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6108897770550095"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00037729541193829834"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1370"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=F32 Out=F64": {
+          "device": 0,
+          "type_config_index": 23,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1233"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003844534225466336"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009387088977698597"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00037907109053659035"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009568452852068391"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44258758894.67376"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "531105106736.0851"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6101850950552448"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003776787067281789"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1392"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I64 Out=I8": {
+          "device": 0,
+          "type_config_index": 24,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I64 Out=I16": {
+          "device": 0,
+          "type_config_index": 25,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I64 Out=I32": {
+          "device": 0,
+          "type_config_index": 26,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I64 Out=F32": {
+          "device": 0,
+          "type_config_index": 27,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I64 Out=I64": {
+          "device": 0,
+          "type_config_index": 28,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=0 In=I64 Out=F64": {
+          "device": 0,
+          "type_config_index": 29,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "8388608"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1865"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002468652632707771"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008794568336063534"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002414397094508553"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009088437943671243"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "34744110730.913086"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "555905771694.6094"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6386785060829612"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00023926271107803853"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2168"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=F64 Out=I8": {
+          "device": 0,
+          "type_config_index": 30,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F64 Out=I16": {
+          "device": 0,
+          "type_config_index": 31,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F64 Out=I32": {
+          "device": 0,
+          "type_config_index": 32,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F64 Out=F32": {
+          "device": 0,
+          "type_config_index": 33,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F64 Out=I64": {
+          "device": 0,
+          "type_config_index": 34,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "8388608"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1861"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002474318479312196"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009416123268532244"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00024199313163148308"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009609928378243537"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "34664653262.864136"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "554634452205.8262"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6372178908614731"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00024011272523290366"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2177"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=F64 Out=F64": {
+          "device": 0,
+          "type_config_index": 35,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=1 In=I8 Out=I8": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=1 In=I8 Out=I16": {
+          "device": 1,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "715"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006812909104895107"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.029682520209047932"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006765060471488043"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.029725089166496972"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "99199207875.28265"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "297597623625.84796"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.4064652857651988"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000659287437142213"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "797"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I8 Out=I32": {
+          "device": 1,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "566"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008641483356890464"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00815440605473416"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008593197461783684"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008178118032486274"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "78095335640.14047"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "390476678200.70233"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5333215119655572"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008574365556141886"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "607"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I8 Out=F32": {
+          "device": 1,
+          "type_config_index": 3,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "568"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008621727816901408"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008348927642653206"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000857566987334842"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008407666935430734"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "78254952663.88672"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "391274763319.4336"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5344115539218662"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008559337940091401"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "612"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I8 Out=I64": {
+          "device": 1,
           "type_config_index": 4,
           "min_samples": 10,
           "min_time": 0.5,
@@ -6063,24 +15044,26 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
-              "value": "U64"
+              "value": "I8"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
+              "value": "I64"
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -6088,11 +15071,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 2097152
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "536870912"
               }
             },
             "Number of Samples (Cold)": {
@@ -6110,7 +15107,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 577
+                "value": "339"
               }
             },
             "Average CPU Time (Cold)": {
@@ -6128,7 +15125,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.0005508495667244364
+                "value": "0.0014581254159292036"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -6146,7 +15143,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.011094070357430255
+                "value": "0.005934832249204677"
               }
             },
             "Average GPU Time (Cold)": {
@@ -6164,7 +15161,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.0005439041656679245
+                "value": "0.001453499562620765"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -6182,7 +15179,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.01115097584159383
+                "value": "0.005963799027107206"
               }
             },
             "Element Throughput": {
@@ -6200,7 +15197,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 481967259.21023655
+                "value": "46170542961.153595"
               }
             },
             "Average Global Memory Throughput": {
@@ -6218,7 +15215,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 7711476147.363785
+                "value": "415534886650.3824"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -6236,826 +15233,50 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.060230849688857356
+                "value": "0.5675465562860337"
               }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 4,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
             },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
+            "Average GPU Time (Batch)": {
               "hint": {
                 "type": "string",
-                "value": "bytes"
+                "value": "duration"
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
               },
               "value": {
-                "type": "int64",
-                "value": 8388608
+                "type": "float64",
+                "value": "0.0014501432381838642"
               }
             },
-            "Number of Samples (Cold)": {
+            "Number of Samples (Batch)": {
               "hint": {
                 "type": "string",
                 "value": "sample_size"
               },
               "short_name": {
                 "type": "string",
-                "value": "Samples"
+                "value": "Batch"
               },
               "description": {
                 "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
+                "value": "Number of kernel executions in hot time measurements."
               },
               "value": {
                 "type": "int64",
-                "value": 180
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0018207355555555549
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.009870487088215963
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0018114751981364358
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.009874938925190486
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 578851977.150296
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9261631634.404736
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07233841254065183
+                "value": "361"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 4,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 46
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007233730434782609
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02377618510012125
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007224911980007006
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.023868355689650532
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 580533577.6555624
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9288537242.488998
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07254856006692857
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 4,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 134217728
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 12
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02786785833333334
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006751988230344623
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.027856205463409427
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00670616725677252
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 602279302.6149144
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9636468841.83863
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0752660963027886
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 4,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 3
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11277853333333336
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11276614379882811
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 595115357.6708314
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9521845722.733303
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0743708270021034
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 4,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 4,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=I8 Input=Rand Pattern=Ascend Elements=2^16": {
-          "device": 0,
+        "Device=1 In=I8 Out=F64": {
+          "device": 1,
           "type_config_index": 5,
           "min_samples": 10,
           "min_time": 0.5,
@@ -7063,24 +15284,26 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
               "value": "I8"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
+              "value": "F64"
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -7088,11 +15311,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 65536
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "536870912"
               }
             },
             "Number of Samples (Cold)": {
@@ -7110,7 +15347,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 2603
+                "value": "339"
               }
             },
             "Average CPU Time (Cold)": {
@@ -7128,7 +15365,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 6.0122819823280644e-05
+                "value": "0.0014608549616519177"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -7146,7 +15383,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.2017958130718533
+                "value": "0.005454444454530878"
               }
             },
             "Average GPU Time (Cold)": {
@@ -7164,7 +15401,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 5.2423252989947936e-05
+                "value": "0.0014561624537527042"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -7182,7 +15419,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.19022823395202412
+                "value": "0.0054738241927221685"
               }
             },
             "Element Throughput": {
@@ -7200,7 +15437,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 1250132265.0192351
+                "value": "46086110671.96002"
               }
             },
             "Average Global Memory Throughput": {
@@ -7218,7 +15455,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 2500264530.0384703
+                "value": "414774996047.64026"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -7236,1330 +15473,50 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.019528434532292475
+                "value": "0.5665086812276555"
               }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I8 Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 5,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I8"
             },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
+            "Average GPU Time (Batch)": {
               "hint": {
                 "type": "string",
-                "value": "bytes"
+                "value": "duration"
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
               },
               "value": {
-                "type": "int64",
-                "value": 262144
+                "type": "float64",
+                "value": "0.0014524769206623453"
               }
             },
-            "Number of Samples (Cold)": {
+            "Number of Samples (Batch)": {
               "hint": {
                 "type": "string",
                 "value": "sample_size"
               },
               "short_name": {
                 "type": "string",
-                "value": "Samples"
+                "value": "Batch"
               },
               "description": {
                 "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
+                "value": "Number of kernel executions in hot time measurements."
               },
               "value": {
                 "type": "int64",
-                "value": 1398
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8.949971387696734e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11888824393616607
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8.337345877622417e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.023068268156533414
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3144214044.227182
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6288428088.454364
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.049116065424693545
+                "value": "364"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=I8 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 5,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1048576
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 481
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00017325010395010385
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0066179185169862416
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00016742034008621916
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00660582250812737
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6263133854.942582
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 12526267709.885164
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.09783700723166992
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I8 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 5,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 124
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0005489895161290321
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.010368472853039202
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0005430583213606187
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.010216130794040566
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7723487211.26541
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15446974422.53082
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12064932534468586
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I8 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 5,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 30
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0019625433333333338
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007820820087352303
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0019545994718869527
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006915737015092503
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8583454687.933291
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17166909375.866583
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13408295875926787
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I8 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 5,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 8
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0075993375
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006229944971220976
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007590148031711578
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006228135758381483
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8841575120.751228
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17683150241.502457
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13811508249111518
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I8 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 5,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.030969200000000002
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.03096171188354492
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8669916476.50672
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17339832953.01344
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1354335865487803
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I8 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 5,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1339969
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13392364501953125
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8017567202.889429
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16035134405.778858
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12524317675095958
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I16 Input=Rand Pattern=Ascend Elements=2^16": {
-          "device": 0,
+        "Device=1 In=I16 Out=I8": {
+          "device": 1,
           "type_config_index": 6,
           "min_samples": 10,
           "min_time": 0.5,
@@ -8567,1345 +15524,21 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
               "value": "I16"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 131072
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2279
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8.156450197454992e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.060115866604561134
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7.509039476441896e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.030698884823584592
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 872761425.8202536
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3491045703.2810144
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02726697781242982
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I16 Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 6,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 524288
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1080
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00015584981481481477
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1277032897361289
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00014661810500202392
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.023857683089802767
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1787937444.672207
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 7151749778.688828
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.055859080375912494
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I16 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 6,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 2097152
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 388
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.000382848969072165
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0119905873372064
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.000376419794160066
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.010115421873716586
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2785655845.5959177
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 11142623382.38367
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.08702998767795295
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I16 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 6,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 8388608
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 107
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0011626495327102806
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.011046847535561077
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0011553172374440123
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.010639681594967242
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3630434883.2181773
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14521739532.87271
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1134227344169638
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I16 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 6,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 26
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004200307692307692
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.008829704122151534
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004190125520412738
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.008654100653918605
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4003988882.4971046
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16015955529.988419
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.12509337923322622
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I16 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 6,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 134217728
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 7
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.015656399999999997
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004407923988907438
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.015647593361990793
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004421018196379044
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4288765847.086274
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17155063388.345097
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1339904351126679
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I16 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 6,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0661545
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06614323425292969
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4058396282.4302044
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16233585129.720818
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1267931855295615
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I16 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 6,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
+              "value": "I8"
             }
           },
           "summaries": null,
           "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
         },
-        "Device=0 Key=I32 Input=Rand Pattern=Ascend Elements=2^16": {
-          "device": 0,
+        "Device=1 In=I16 Out=I16": {
+          "device": 1,
           "type_config_index": 7,
           "min_samples": 10,
           "min_time": 0.5,
@@ -9913,1345 +15546,21 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
-              "value": "I32"
+              "value": "I16"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 262144
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1863
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9.472168545356936e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1737926133675047
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8.730608265261114e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1873449437251008
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 750646438.4705727
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6005171507.764582
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.046903676485289474
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I32 Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 7,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1048576
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 919
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0002210763873775844
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07035645807624645
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00021252275933638198
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06707277848247957
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1233486713.6986363
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9867893709.58909
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07707365119336643
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I32 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 7,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 317
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0006475441640378547
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.014103444072938905
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0006398196086898584
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.013812890521369204
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1638861931.95476
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 13110895455.63808
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10240326992969008
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I32 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 7,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 84
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0023177226190476186
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.019912883301982914
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0023088179202306832
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.020342096350492007
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1816645636.3873556
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14533165091.098845
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11351197428063957
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I32 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 7,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 21
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.008913814285714286
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006001791310703415
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.008904065495445614
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0060174926866995325
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1884219743.0579844
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15073757944.463875
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11773430036603251
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I32 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 7,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 6
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.03555846666666667
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.008199160296918712
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.03554372278849283
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00819692333781414
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1888065141.609935
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15104521132.87948
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11797457770619439
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I32 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 7,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1451229
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.14511042785644532
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1849870198.6156194
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14798961588.924955
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11558799041587224
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I32 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 7,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
+              "value": "I16"
             }
           },
           "summaries": null,
           "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
+          "skip_reason": "Not a conversion: InputType == OutputType."
         },
-        "Device=0 Key=I64 Input=Rand Pattern=Ascend Elements=2^16": {
-          "device": 0,
+        "Device=1 In=I16 Out=I32": {
+          "device": 1,
           "type_config_index": 8,
           "min_samples": 10,
           "min_time": 0.5,
@@ -11259,24 +15568,26 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
-              "value": "I64"
+              "value": "I16"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
+              "value": "I32"
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -11284,11 +15595,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 524288
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
               }
             },
             "Number of Samples (Cold)": {
@@ -11306,7 +15631,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 1141
+                "value": "1042"
               }
             },
             "Average CPU Time (Cold)": {
@@ -11324,7 +15649,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.0002058517966695883
+                "value": "0.00046152389539347375"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -11342,7 +15667,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.11032147567669195
+                "value": "0.007516961198942111"
               }
             },
             "Average GPU Time (Cold)": {
@@ -11360,7 +15685,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.00019295587745233125
+                "value": "0.0004568425950928514"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -11378,7 +15703,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.0891999662438443
+                "value": "0.0075614567935713"
               }
             },
             "Element Throughput": {
@@ -11396,7 +15721,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 339642413.93056464
+                "value": "73448562722.52853"
               }
             },
             "Average Global Memory Throughput": {
@@ -11414,7 +15739,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 5434278622.889034
+                "value": "440691376335.17114"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -11432,1014 +15757,50 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.04244469056867841
+                "value": "0.6019058352479938"
               }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I64 Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 8,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I64"
             },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
+            "Average GPU Time (Batch)": {
               "hint": {
                 "type": "string",
-                "value": "bytes"
+                "value": "duration"
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
               },
               "value": {
-                "type": "int64",
-                "value": 2097152
+                "type": "float64",
+                "value": "0.00045486935942230756"
               }
             },
-            "Number of Samples (Cold)": {
+            "Number of Samples (Batch)": {
               "hint": {
                 "type": "string",
                 "value": "sample_size"
               },
               "short_name": {
                 "type": "string",
-                "value": "Samples"
+                "value": "Batch"
               },
               "description": {
                 "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
+                "value": "Number of kernel executions in hot time measurements."
               },
               "value": {
                 "type": "int64",
-                "value": 530
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0005289122641509438
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.33053905464430366
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0005132295253704184
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02690586292503542
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 510773420.1589437
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8172374722.543099
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06383071983990798
+                "value": "1156"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=I64 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 8,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 8388608
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 177
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0017896672316384183
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.015371140659983133
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.001780063275563514
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.015018523735211514
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 589066700.2655019
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9425067204.24803
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0736149337997378
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I64 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 8,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 47
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006963521276595745
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.021866700638764357
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006949177173857993
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.021990225740270965
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 603568436.2428534
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9657094979.885654
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07542719773092395
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I64 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 8,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 134217728
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 12
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.027483341666666675
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.009214836696393284
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.027467103958129877
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.009243931693158285
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 610811246.2666156
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9772979940.26585
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07633232270265129
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I64 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 8,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 3
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11026343333333334
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11025037638346354
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 608695101.1086586
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9739121617.738537
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07606787067091458
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=I64 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 8,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=I64 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 8,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "I64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=F32 Input=Rand Pattern=Ascend Elements=2^16": {
-          "device": 0,
+        "Device=1 In=I16 Out=F32": {
+          "device": 1,
           "type_config_index": 9,
           "min_samples": 10,
           "min_time": 0.5,
@@ -12447,24 +15808,26 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
               "type": "string",
               "value": "F32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -12472,11 +15835,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 262144
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
               }
             },
             "Number of Samples (Cold)": {
@@ -12494,7 +15871,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 1764
+                "value": "1047"
               }
             },
             "Average CPU Time (Cold)": {
@@ -12512,7 +15889,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 9.839484126984118e-05
+                "value": "0.00045967601432664773"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -12530,7 +15907,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.2859180404938184
+                "value": "0.007580415029008197"
               }
             },
             "Average GPU Time (Cold)": {
@@ -12548,7 +15925,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 9.14749023493997e-05
+                "value": "0.00045502618507418957"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -12566,7 +15943,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.3060250465322957
+                "value": "0.007602410404504316"
               }
             },
             "Element Throughput": {
@@ -12584,7 +15961,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 716436949.5545034
+                "value": "73741760585.77625"
               }
             },
             "Average Global Memory Throughput": {
@@ -12602,7 +15979,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 5731495596.436028
+                "value": "442450563514.6575"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -12620,1172 +15997,50 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.04476611781770204
+                "value": "0.6043085712339618"
               }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=F32 Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 9,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "F32"
             },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
+            "Average GPU Time (Batch)": {
               "hint": {
                 "type": "string",
-                "value": "bytes"
+                "value": "duration"
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
               },
               "value": {
-                "type": "int64",
-                "value": 1048576
+                "type": "float64",
+                "value": "0.0004530724069916505"
               }
             },
-            "Number of Samples (Cold)": {
+            "Number of Samples (Batch)": {
               "hint": {
                 "type": "string",
                 "value": "sample_size"
               },
               "short_name": {
                 "type": "string",
-                "value": "Samples"
+                "value": "Batch"
               },
               "description": {
                 "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
+                "value": "Number of kernel executions in hot time measurements."
               },
               "value": {
                 "type": "int64",
-                "value": 1059
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0001759974504249292
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06830900199777254
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00016901520620662156
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0683105078072142
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1551008373.0544827
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 12408066984.435862
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.09691379486718837
+                "value": "1164"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=F32 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 9,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "F32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 360
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0004911733333333335
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.021226057185570184
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00048378142292300816
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.021253466020927158
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2167458174.942936
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17339665399.543488
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13543227786446738
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=F32 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 9,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "F32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 96
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0017458687499999996
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.003986863328356874
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0017372319946686425
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0034632504917798987
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2414360323.130024
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 19314882585.04019
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.15085980524431541
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=F32 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 9,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "F32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 24
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006740016666666665
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0016710482080877498
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006730861365795134
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0016728064092106185
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2492580828.54869
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 19940646628.38952
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.15574736494305735
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=F32 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 9,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "F32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 6
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.026749483333333338
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.000816878555365262
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.026738741238911946
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0007800612715292246
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2509798924.353957
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 20078391394.831657
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.15682322696538098
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=F32 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 9,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "F32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10676805
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10674793624877929
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2514666469.7517247
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 20117331758.013798
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1571273725163537
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=F32 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 9,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "F32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=F64 Input=Rand Pattern=Ascend Elements=2^16": {
-          "device": 0,
+        "Device=1 In=I16 Out=I64": {
+          "device": 1,
           "type_config_index": 10,
           "min_samples": 10,
           "min_time": 0.5,
@@ -13793,776 +16048,2516 @@
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "648"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007539600570987655"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005701338376763893"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000749293333218421"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005730659247124155"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44781436738.365845"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "447814367383.65845"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6116345708365091"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007462590063859665"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "701"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I16 Out=F64": {
+          "device": 1,
+          "type_config_index": 11,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "650"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007515365646153841"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005320261152122883"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007468673968315132"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00533121216008688"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44926893505.259796"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "449268935052.59796"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6136212508913325"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007440289011028757"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "702"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I32 Out=I8": {
+          "device": 1,
+          "type_config_index": 12,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I32 Out=I16": {
+          "device": 1,
+          "type_config_index": 13,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I32 Out=I32": {
+          "device": 1,
+          "type_config_index": 14,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=1 In=I32 Out=F32": {
+          "device": 1,
+          "type_config_index": 15,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1688"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00027765218187203764"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005690620491369388"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00027302053109941316"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005713997774637474"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "61450382256.75059"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "491603058054.0047"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6714421138193901"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00027140032503120137"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1928"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I32 Out=I64": {
+          "device": 1,
+          "type_config_index": 16,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1134"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000422905379188712"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004818481737573335"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004182333121013812"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004829428135064118"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "40114489961.844894"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "481373879542.13873"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6574708800564614"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004160488643510754"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1267"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I32 Out=F64": {
+          "device": 1,
+          "type_config_index": 17,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1132"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004233320008833917"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004685003714910728"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041865130761381596"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004676709118042214"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "40074438309.11453"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "480893259709.37445"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6568144390698405"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041636213471617884"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1264"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=F32 Out=I8": {
+          "device": 1,
+          "type_config_index": 18,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=F32 Out=I16": {
+          "device": 1,
+          "type_config_index": 19,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=F32 Out=I32": {
+          "device": 1,
+          "type_config_index": 20,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1665"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002817099831831833"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.012603278274487326"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002770048382224978"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0127786417628205"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "60566508901.63906"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "484532071213.1125"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6617844067049723"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002751834324989535"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1941"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=F32 Out=F32": {
+          "device": 1,
+          "type_config_index": 21,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=1 In=F32 Out=I64": {
+          "device": 1,
+          "type_config_index": 22,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1133"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004230943777581643"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004719817832949844"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041844157444515244"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004751688895767683"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "40094524599.393234"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "481134295192.7188"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.657143650558237"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004160357588015425"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1252"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=F32 Out=F64": {
+          "device": 1,
+          "type_config_index": 23,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1132"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00042342536395759757"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004748798224952708"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041871643782504436"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004750041166889743"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "40068204838.45002"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "480818458061.40027"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6567122733574632"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041632065453087554"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1252"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I64 Out=I8": {
+          "device": 1,
+          "type_config_index": 24,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I64 Out=I16": {
+          "device": 1,
+          "type_config_index": 25,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I64 Out=I32": {
+          "device": 1,
+          "type_config_index": 26,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I64 Out=F32": {
+          "device": 1,
+          "type_config_index": 27,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I64 Out=I64": {
+          "device": 1,
+          "type_config_index": 28,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=1 In=I64 Out=F64": {
+          "device": 1,
+          "type_config_index": 29,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "8388608"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1753"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002666450433542495"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004046628770937376"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00026198611749762206"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004009600477982423"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "32019284380.88381"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "512308550094.1409"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6997221237081251"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00026007244216493403"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2008"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=F64 Out=I8": {
+          "device": 1,
+          "type_config_index": 30,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 0.5,
+          "axis_values": {
+            "In": {
               "type": "string",
               "value": "F64"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 65536
+              "value": "I8"
             }
           },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 524288
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1336
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00016353682634730543
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.11022562313886396
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00015515176088599353
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10908954362964919
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 422399330.9889422
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6758389295.823075
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.052786719693694355
-              }
-            }
-          },
-          "is_skipped": false
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
         },
-        "Device=0 Key=F64 Input=Rand Pattern=Ascend Elements=2^18": {
-          "device": 0,
-          "type_config_index": 10,
+        "Device=1 In=F64 Out=I16": {
+          "device": 1,
+          "type_config_index": 31,
           "min_samples": 10,
           "min_time": 0.5,
           "max_noise": 0.005,
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
               "value": "F64"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 262144
+              "value": "I16"
             }
           },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 2097152
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 587
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0004996207836456557
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.024762621528400496
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0004879914401844776
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01040682032664116
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 537189750.5023869
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8595036008.03819
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06713193582884115
-              }
-            }
-          },
-          "is_skipped": false
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
         },
-        "Device=0 Key=F64 Input=Rand Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 10,
+        "Device=1 In=F64 Out=I32": {
+          "device": 1,
+          "type_config_index": 32,
           "min_samples": 10,
           "min_time": 0.5,
           "max_noise": 0.005,
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
               "value": "F64"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
+              "value": "I32"
             }
           },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 8388608
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 188
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0017351010638297873
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.012349562772288485
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0017246832353003474
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004554329966154549
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 607981789.6631866
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9727708634.610985
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07597872902564191
-              }
-            }
-          },
-          "is_skipped": false
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
         },
-        "Device=0 Key=F64 Input=Rand Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 10,
+        "Device=1 In=F64 Out=F32": {
+          "device": 1,
+          "type_config_index": 33,
           "min_samples": 10,
           "min_time": 0.5,
           "max_noise": 0.005,
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
               "value": "F64"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
+              "value": "F32"
             }
           },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 50
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0066887959999999995
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0030118761449178105
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006673763227462767
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0026885930302360814
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 628476596.6434491
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 10055625546.295185
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07853993959553225
-              }
-            }
-          },
-          "is_skipped": false
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
         },
-        "Device=0 Key=F64 Input=Rand Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 10,
+        "Device=1 In=F64 Out=I64": {
+          "device": 1,
+          "type_config_index": 34,
           "min_samples": 10,
           "min_time": 0.5,
           "max_noise": 0.005,
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
               "value": "F64"
             },
-            "Input": {
+            "Out": {
               "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
+              "value": "I64"
             }
           },
           "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "8388608"
+              }
+            },
             "Input Buffer Size: ": {
               "hint": {
                 "type": "string",
@@ -14570,11 +18565,25 @@
               },
               "short_name": {
                 "type": "string",
-                "value": "Size"
+                "value": "InSize"
               },
               "value": {
                 "type": "int64",
-                "value": 134217728
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
               }
             },
             "Number of Samples (Cold)": {
@@ -14592,7 +18601,7 @@
               },
               "value": {
                 "type": "int64",
-                "value": 13
+                "value": "1753"
               }
             },
             "Average CPU Time (Cold)": {
@@ -14610,7 +18619,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.026423876923076926
+                "value": "0.00026657142213348556"
               }
             },
             "CPU Relative Standard Deviation (Cold)": {
@@ -14628,7 +18637,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.0007777926505595894
+                "value": "0.004288873685096382"
               }
             },
             "Average GPU Time (Cold)": {
@@ -14646,7 +18655,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.026412524149968072
+                "value": "0.0002619141041552483"
               }
             },
             "GPU Relative Standard Deviation (Cold)": {
@@ -14664,7 +18673,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.0007825372143057571
+                "value": "0.00422427515777647"
               }
             },
             "Element Throughput": {
@@ -14682,7 +18691,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 635199267.7693503
+                "value": "32028088090.39048"
               }
             },
             "Average Global Memory Throughput": {
@@ -14700,7 +18709,7 @@
               },
               "value": {
                 "type": "float64",
-                "value": 10163188284.309605
+                "value": "512449409446.2477"
               }
             },
             "Percent Peak Global Memory Throughput": {
@@ -14718,8326 +18727,69 @@
               },
               "value": {
                 "type": "float64",
-                "value": 0.07938006345530496
+                "value": "0.6999145124648269"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00026007216520352087"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2013"
               }
             }
           },
           "is_skipped": false
         },
-        "Device=0 Key=F64 Input=Rand Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 10,
+        "Device=1 In=F64 Out=F64": {
+          "device": 1,
+          "type_config_index": 35,
           "min_samples": 10,
           "min_time": 0.5,
           "max_noise": 0.005,
           "skip_time": -1.0,
           "timeout": 0.5,
           "axis_values": {
-            "Key": {
+            "In": {
               "type": "string",
               "value": "F64"
             },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 4
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10656022500000001
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10654843139648437
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 629843753.8726102
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 10077500061.961763
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07871079153619223
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=F64 Input=Rand Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 10,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
+            "Out": {
               "type": "string",
               "value": "F64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
             }
           },
           "summaries": null,
           "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=F64 Input=Rand Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 10,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "F64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        }
-      }
-    },
-    {
-      "index": 1,
-      "name": "cub::DeviceRadixSort::SortKeys - Constant Values",
-      "min_samples": 10,
-      "min_time": 0.5,
-      "max_noise": 0.005,
-      "skip_time": -1.0,
-      "timeout": 0.5,
-      "devices": [
-        0
-      ],
-      "axes": {
-        "Key": {
-          "type": "type",
-          "flags": "",
-          "values": [
-            {
-              "input_string": "U8",
-              "description": "uint8_t",
-              "is_active": true
-            },
-            {
-              "input_string": "U16",
-              "description": "uint16_t",
-              "is_active": true
-            },
-            {
-              "input_string": "U32",
-              "description": "uint32_t",
-              "is_active": true
-            },
-            {
-              "input_string": "U64",
-              "description": "uint64_t",
-              "is_active": true
-            }
-          ]
-        },
-        "Input": {
-          "type": "type",
-          "flags": "",
-          "values": [
-            {
-              "input_string": "Const",
-              "description": "All values = 42",
-              "is_active": true
-            }
-          ]
-        },
-        "Pattern": {
-          "type": "type",
-          "flags": "",
-          "values": [
-            {
-              "input_string": "Ascend",
-              "description": "",
-              "is_active": true
-            }
-          ]
-        },
-        "Elements": {
-          "type": "int64",
-          "flags": "pow2",
-          "values": [
-            {
-              "input_string": "20",
-              "description": "2^20 = 1048576",
-              "value": 1048576
-            },
-            {
-              "input_string": "22",
-              "description": "2^22 = 4194304",
-              "value": 4194304
-            },
-            {
-              "input_string": "24",
-              "description": "2^24 = 16777216",
-              "value": 16777216
-            },
-            {
-              "input_string": "26",
-              "description": "2^26 = 67108864",
-              "value": 67108864
-            },
-            {
-              "input_string": "28",
-              "description": "2^28 = 268435456",
-              "value": 268435456
-            },
-            {
-              "input_string": "30",
-              "description": "2^30 = 1073741824",
-              "value": 1073741824
-            }
-          ]
-        }
-      },
-      "states": {
-        "Device=0 Key=U8 Input=Const Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1048576
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2131
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9.847855466916956e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1166410494531917
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8.792886719743954e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02950844558658552
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 11925275889.7198
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 23850551779.4396
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1862858643107942
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Const Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1103
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00030513427017225814
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.03970046347053595
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.000298017248173573
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.010246113439079385
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 14074031035.804775
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 28148062071.60955
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.21985177199145176
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Const Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 364
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0011047906593406591
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004165150297152156
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.001098251174111943
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0041277328508611206
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15276301446.767155
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 30552602893.53431
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.23863255196774485
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Const Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 99
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004277376767676769
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007146089620860907
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004269709249939582
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007192485269555904
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15717431813.641554
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 31434863627.283108
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.24552349121534545
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Const Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 26
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.016944819230769233
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004125824124512113
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.016930290442246653
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004190066574775472
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15855336735.993914
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 31710673471.987827
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.24767771707063724
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Const Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 7
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0677445
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004186949546309279
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06773285457066126
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004172460379707014
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15852599610.72267
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 31705199221.44534
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.24763496017749734
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Const Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 2097152
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1317
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00024883545937737277
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.014216253160288001
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00024280052241178212
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.014413629746702477
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4318672750.718583
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17274691002.874332
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13492479226189025
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Const Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 8388608
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 479
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0008450308977035485
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004254901627868165
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0008386715661757668
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004159295165978451
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 5001128175.986078
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 20004512703.944313
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.15624619395107717
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Const Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 139
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.003159730215827337
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0012442254198139842
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00315222075688753
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0011064757684230242
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 5322348050.447346
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 21289392201.789383
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1662818061249483
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Const Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 134217728
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 36
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.012422872222222228
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0023351107937830437
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.012409479114744399
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0016255550269386443
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 5407871142.654505
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 21631484570.61802
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1689537347742597
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Const Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 10
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.049459970000000006
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0006367677998496455
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04944267120361327
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0006514143080448675
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 5429226404.344892
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 21716905617.379566
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.16962091990580141
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Const Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=U32 Input=Const Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 781
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0004726912932138284
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.013777372000949644
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0004643977672510366
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.005762605261042134
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2257926445.6996794
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 18063411565.597435
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.14108513157333663
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Const Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 250
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0017142272
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006995479034955076
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0017067146277427662
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007040239153063608
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2457530937.9912105
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 19660247503.929684
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.15355729430087545
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Const Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 68
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006638291176470588
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007984223254281496
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006619076686746933
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0014220730690780873
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2534676178.2639914
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 20277409426.11193
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.15837766672481826
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Const Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 18
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02635283888888889
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0019931714593177012
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.026339953634474014
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0020022842651574563
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2547797347.379048
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 20382378779.032383
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1591975348274836
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Const Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 5
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10489146
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0007390240134400786
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10486280975341797
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00048311871799088955
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 2559872815.07351
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 20478982520.58808
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1599520629263628
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Const Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=U64 Input=Const Pattern=Ascend Elements=2^20": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 8388608
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 257
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0017021642023346295
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004191102519162322
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0016938885023621748
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.002849428031422373
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 619034841.1585128
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9904557458.536205
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07736001514102885
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Const Pattern=Ascend Elements=2^22": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 70
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006613181428571428
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00494400837539835
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006595767314093454
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0035342650950247294
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 635908424.3372039
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 10174534789.395262
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07946868587068281
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Const Pattern=Ascend Elements=2^24": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 134217728
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 19
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.026098378947368425
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0013667184743594541
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.026086989252190843
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0013380879892475642
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 643125806.4244041
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 10290012902.790466
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.08037063314476432
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Const Pattern=Ascend Elements=2^26": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 5
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10427278
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.000645711511791114
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.10425362091064454
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.000694228812251099
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 643707752.4388223
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 10299324039.021156
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.08044335821529897
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Const Pattern=Ascend Elements=2^28": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=U64 Input=Const Pattern=Ascend Elements=2^30": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Const"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        }
-      }
-    },
-    {
-      "index": 2,
-      "name": "cub::DeviceRadixSort::SortKeys - Half Word",
-      "min_samples": 10,
-      "min_time": 0.5,
-      "max_noise": 0.005,
-      "skip_time": -1.0,
-      "timeout": 0.5,
-      "devices": [
-        0
-      ],
-      "axes": {
-        "Key": {
-          "type": "type",
-          "flags": "",
-          "values": [
-            {
-              "input_string": "U8",
-              "description": "uint8_t",
-              "is_active": true
-            },
-            {
-              "input_string": "U16",
-              "description": "uint16_t",
-              "is_active": true
-            },
-            {
-              "input_string": "U32",
-              "description": "uint32_t",
-              "is_active": true
-            },
-            {
-              "input_string": "U64",
-              "description": "uint64_t",
-              "is_active": true
-            }
-          ]
-        },
-        "Input": {
-          "type": "type",
-          "flags": "",
-          "values": [
-            {
-              "input_string": "Rand",
-              "description": "Random values uniformly distributed across `T`'s value range",
-              "is_active": true
-            }
-          ]
-        },
-        "Pattern": {
-          "type": "type",
-          "flags": "",
-          "values": [
-            {
-              "input_string": "Ascend",
-              "description": "",
-              "is_active": true
-            }
-          ]
-        },
-        "Elements": {
-          "type": "int64",
-          "flags": "pow2",
-          "values": [
-            {
-              "input_string": "20",
-              "description": "2^20 = 1048576",
-              "value": 1048576
-            },
-            {
-              "input_string": "22",
-              "description": "2^22 = 4194304",
-              "value": 4194304
-            },
-            {
-              "input_string": "24",
-              "description": "2^24 = 16777216",
-              "value": 16777216
-            },
-            {
-              "input_string": "26",
-              "description": "2^26 = 67108864",
-              "value": 67108864
-            },
-            {
-              "input_string": "28",
-              "description": "2^28 = 268435456",
-              "value": 268435456
-            },
-            {
-              "input_string": "30",
-              "description": "2^30 = 1073741824",
-              "value": 1073741824
-            }
-          ]
-        },
-        "Bits": {
-          "type": "string",
-          "flags": "",
-          "values": [
-            {
-              "input_string": "Half",
-              "description": "",
-              "value": "Half"
-            }
-          ]
-        }
-      },
-      "states": {
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^20 Bits=Half": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1048576
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 506
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 9.122292490118582e-05
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.14210402239874367
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 8.171724868209468e-05
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.042136506336878675
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 12831758495.434473
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 25663516990.868946
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.20044611496242304
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^22 Bits=Half": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 131
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0002768381679389314
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.040993814974847456
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00026943584525858166
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04165661510853529
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 15566985884.801865
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 31133971769.60373
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.24317336110975168
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^24 Bits=Half": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 33
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0009524848484848485
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0029631519174044907
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0009448620568622242
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0027350116544227862
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17756259634.041355
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 35512519268.08271
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.27737221372846405
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^26 Bits=Half": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 8
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0037811125
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.017870484219602495
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0037547920346260076
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.011926878925089247
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17872857772.45032
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 35745715544.90064
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.2791936042934629
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^28 Bits=Half": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.014637200000000001
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.014622128009796141
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 18358166186.218643
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 36716332372.43729
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.2867746529964172
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U8 Input=Rand Pattern=Ascend Elements=2^30 Bits=Half": {
-          "device": 0,
-          "type_config_index": 0,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U8"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 1
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0650038
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.06499456024169922
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16520487560.91296
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 33040975121.82592
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.2580681011139865
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^20 Bits=Half": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 2097152
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 451
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0002551281596452328
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.02001747605537379
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0002489812859476538
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.020267049724350405
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 4211465114.7735424
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 16845860459.09417
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.13157539098892596
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^22 Bits=Half": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 8388608
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 116
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0007822155172413793
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.023070481778936412
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.000775460692315266
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.023745726673375194
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 5408789951.01249
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 21635159804.04996
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.16898244035905055
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^24 Bits=Half": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 29
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0027147862068965514
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.006919240505320664
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0027074648265180915
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00697027260615485
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6196651507.962958
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 24786606031.851833
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.19359696038374652
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^26 Bits=Half": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 134217728
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 7
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.010597285714285715
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007928262423253452
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.010588553019932338
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.007853809341114676
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6337869194.560527
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 25351476778.242107
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.19800891010249083
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^28 Bits=Half": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04330945
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04329145622253418
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 6200656651.976361
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 24802626607.905445
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.19372208985179834
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U16 Input=Rand Pattern=Ascend Elements=2^30 Bits=Half": {
-          "device": 0,
-          "type_config_index": 1,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U16"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^20 Bits=Half": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 4194304
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 393
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00035548193384223926
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.03038537754729681
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00034845801535756846
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.030622610260206715
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3009188923.159104
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 24073511385.27283
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.18802730087222594
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^22 Bits=Half": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 16777216
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 102
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0012527852941176467
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.03836046238706917
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.001244852706497791
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0384242586382674
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3369317492.830179
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 26954539942.641434
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.21052971087416766
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^24 Bits=Half": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 67108864
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 26
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.004759565384615385
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.012430636692892466
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.00474302521118751
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.011928461924356468
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3537239473.327508
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 28297915786.620064
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.22102221153008672
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^26 Bits=Half": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 268435456
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 7
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.019093514285714284
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.013031432629676501
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.01906842068263463
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.013252538341278812
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3519371903.784103
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 28154975230.272823
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.21990576754462027
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^28 Bits=Half": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 1073741824
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 2
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.07657910000000001
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0765664176940918
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": null
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 3505916354.510519
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 28047330836.084152
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.21906500590543107
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U32 Input=Rand Pattern=Ascend Elements=2^30 Bits=Half": {
-          "device": 0,
-          "type_config_index": 2,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U32"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^20 Bits=Half": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1048576
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 8388608
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 270
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0009398844444444449
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.023251730900045106
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0009325108146225963
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.023261920514979124
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1124465243.252302
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 17991443892.03683
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.14052302465037514
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^22 Bits=Half": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 4194304
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 33554432
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 68
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0036767867647058824
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.04207676223538282
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.0036685091747957112
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.042174363328430726
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1143326566.7745178
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 18293225068.392284
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.14288010082160932
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^24 Bits=Half": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 16777216
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 134217728
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 18
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.014109038888888889
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.002421042106667527
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.014098197301228839
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.002451783799133172
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1190025621.1152365
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 19040409937.843784
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.1487160236334962
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^26 Bits=Half": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 67108864
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": {
-            "Input Buffer Size: ": {
-              "hint": {
-                "type": "string",
-                "value": "bytes"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Size"
-              },
-              "value": {
-                "type": "int64",
-                "value": 536870912
-              }
-            },
-            "Number of Samples (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "sample_size"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Samples"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of kernel executions in cold time measurements."
-              },
-              "value": {
-                "type": "int64",
-                "value": 5
-              }
-            },
-            "Average CPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "CPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time observed from host."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.056853980000000005
-              }
-            },
-            "CPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold CPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.013346033114121256
-              }
-            },
-            "Average GPU Time (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "duration"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GPU Time"
-              },
-              "description": {
-                "type": "string",
-                "value": "Average isolated kernel execution time as measured by CUDA events."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.05684054412841797
-              }
-            },
-            "GPU Relative Standard Deviation (Cold)": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Noise"
-              },
-              "description": {
-                "type": "string",
-                "value": "Relative standard deviation of the cold GPU execution time measurements."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.013346516895045969
-              }
-            },
-            "Element Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "item_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "Elem/s"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of input elements handled per second."
-              },
-              "value": {
-                "type": "float64",
-                "value": 1180651329.5928898
-              }
-            },
-            "Average Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "byte_rate"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "GlobalMem BW"
-              },
-              "description": {
-                "type": "string",
-                "value": "Number of bytes read/written per second to the CUDA device's global memory."
-              },
-              "value": {
-                "type": "float64",
-                "value": 18890421273.486237
-              }
-            },
-            "Percent Peak Global Memory Throughput": {
-              "hint": {
-                "type": "string",
-                "value": "percentage"
-              },
-              "short_name": {
-                "type": "string",
-                "value": "BWPeak"
-              },
-              "description": {
-                "type": "string",
-                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
-              },
-              "value": {
-                "type": "float64",
-                "value": 0.14754453006659457
-              }
-            }
-          },
-          "is_skipped": false
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^28 Bits=Half": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 268435456
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
-        },
-        "Device=0 Key=U64 Input=Rand Pattern=Ascend Elements=2^30 Bits=Half": {
-          "device": 0,
-          "type_config_index": 3,
-          "min_samples": 10,
-          "min_time": 0.5,
-          "max_noise": 0.005,
-          "skip_time": -1.0,
-          "timeout": 0.5,
-          "axis_values": {
-            "Key": {
-              "type": "string",
-              "value": "U64"
-            },
-            "Input": {
-              "type": "string",
-              "value": "Rand"
-            },
-            "Pattern": {
-              "type": "string",
-              "value": "Ascend"
-            },
-            "Elements": {
-              "type": "int64",
-              "value": 1073741824
-            },
-            "Bits": {
-              "type": "string",
-              "value": "Half"
-            }
-          },
-          "summaries": null,
-          "is_skipped": true,
-          "skip_reason": "Unexpected error: bad allocation: cudaErrorMemoryAllocation: out of memory"
+          "skip_reason": "Not a conversion: InputType == OutputType."
         }
       }
     }