mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-03-14 20:27:24 +00:00
* Add cuda architectures to build wheel for * Package scripts in wheel * Separate cuda major version extraction to fix architecutre selection logic * Add back statement printing cuda version * [pre-commit.ci] auto code formatting --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
17525 lines
529 KiB
JSON
17525 lines
529 KiB
JSON
{
|
|
"meta": {
|
|
"argv": [
|
|
"bin/nvbench.example.axes",
|
|
"--json",
|
|
"/home/av/code/src/nvbench/scripts/test_ref.json"
|
|
],
|
|
"version": {
|
|
"json": {
|
|
"major": 1,
|
|
"minor": 0,
|
|
"patch": 0,
|
|
"string": "1.0.0"
|
|
},
|
|
"nvbench": {
|
|
"major": 0,
|
|
"minor": 1,
|
|
"patch": 0,
|
|
"string": "0.1.0",
|
|
"git_branch": "walltime_reports",
|
|
"git_sha": "348acbd6eb752a87e15c28fe1ad1cb827eaaadec",
|
|
"git_version": "old-cmake-63-g348acbd",
|
|
"git_is_dirty": false
|
|
}
|
|
}
|
|
},
|
|
"devices": [
|
|
{
|
|
"id": 0,
|
|
"name": "Quadro GV100",
|
|
"sm_version": 700,
|
|
"ptx_version": 700,
|
|
"sm_default_clock_rate": 1627000000,
|
|
"number_of_sms": 80,
|
|
"max_blocks_per_sm": 32,
|
|
"max_threads_per_sm": 2048,
|
|
"max_threads_per_block": 1024,
|
|
"registers_per_sm": 65536,
|
|
"registers_per_block": 65536,
|
|
"global_memory_size": 34086060032,
|
|
"global_memory_bus_peak_clock_rate": 850000000,
|
|
"global_memory_bus_width": 4096,
|
|
"global_memory_bus_bandwidth": 870400000000,
|
|
"l2_cache_size": 6291456,
|
|
"shared_memory_per_sm": 98304,
|
|
"shared_memory_per_block": 49152,
|
|
"ecc_state": false
|
|
},
|
|
{
|
|
"id": 1,
|
|
"name": "Quadro GP100",
|
|
"sm_version": 600,
|
|
"ptx_version": 600,
|
|
"sm_default_clock_rate": 1442500000,
|
|
"number_of_sms": 56,
|
|
"max_blocks_per_sm": 32,
|
|
"max_threads_per_sm": 2048,
|
|
"max_threads_per_block": 1024,
|
|
"registers_per_sm": 65536,
|
|
"registers_per_block": 65536,
|
|
"global_memory_size": 17069309952,
|
|
"global_memory_bus_peak_clock_rate": 715000000,
|
|
"global_memory_bus_width": 4096,
|
|
"global_memory_bus_bandwidth": 732160000000,
|
|
"l2_cache_size": 4194304,
|
|
"shared_memory_per_sm": 65536,
|
|
"shared_memory_per_block": 49152,
|
|
"ecc_state": false
|
|
}
|
|
],
|
|
"benchmarks": [
|
|
{
|
|
"name": "simple",
|
|
"index": 0,
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"devices": [
|
|
0,
|
|
1
|
|
],
|
|
"axes": null,
|
|
"states": [
|
|
{
|
|
"name": "Device=0",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": null,
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "499"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010094458717434867"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005997663682735138"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010034715849794225"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0005782350585973689"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.51435071"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "524"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001001475909284053"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524782268"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": null,
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "499"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010075622164328662"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004836642334083953"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010027443022431728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00034308545348455907"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.512193993"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "524"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010014738126565483"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5247834060000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "single_float64_axis",
|
|
"index": 1,
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"devices": [
|
|
0,
|
|
1
|
|
],
|
|
"axes": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"flags": "",
|
|
"values": [
|
|
{
|
|
"input_string": "0",
|
|
"description": "",
|
|
"value": 0.0
|
|
},
|
|
{
|
|
"input_string": "0.0001",
|
|
"description": "",
|
|
"value": 0.0001
|
|
},
|
|
{
|
|
"input_string": "0.0002",
|
|
"description": "",
|
|
"value": 0.0002
|
|
},
|
|
{
|
|
"input_string": "0.0003",
|
|
"description": "",
|
|
"value": 0.00030000000000000003
|
|
},
|
|
{
|
|
"input_string": "0.0004",
|
|
"description": "",
|
|
"value": 0.0004
|
|
},
|
|
{
|
|
"input_string": "0.0005",
|
|
"description": "",
|
|
"value": 0.0005
|
|
},
|
|
{
|
|
"input_string": "0.0006",
|
|
"description": "",
|
|
"value": 0.0006000000000000001
|
|
},
|
|
{
|
|
"input_string": "0.0007",
|
|
"description": "",
|
|
"value": 0.0007000000000000001
|
|
},
|
|
{
|
|
"input_string": "0.0008",
|
|
"description": "",
|
|
"value": 0.0008000000000000001
|
|
},
|
|
{
|
|
"input_string": "0.0009",
|
|
"description": "",
|
|
"value": 0.0009000000000000002
|
|
},
|
|
{
|
|
"input_string": "0.001",
|
|
"description": "",
|
|
"value": 0.0010000000000000002
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"states": [
|
|
{
|
|
"name": "Device=0 Duration=0",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "127488"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "9.540251349146535e-06"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.4435508787705211"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "3.9224058844425625e-06"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.14064817853323436"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "11.490547931"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "274905"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.8188127571551e-06"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.500083096"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0001",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0001"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "4853"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00010853461796826674"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.05359830602702947"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00010302987600936478"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00484111901842999"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.637141422"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "5092"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0001013762061275653"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.51621627"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0002",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0002"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2459"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00020891169174461132"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.02717422799526722"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00020340182381027155"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002406936807045068"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5674029660000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2582"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00020172880307174672"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.520873229"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0003",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.00030000000000000003"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1652"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003081868111380144"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.018240770684480382"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00030268341853095175"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0016523707958282026"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5447823540000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1736"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00030105657621462773"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5226434630000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0004",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0004"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1241"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00040852977276389983"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.013603343023457075"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00040306361880540617"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0012210042127847492"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.53335829"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1304"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00040140879812416123"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5234471270000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0005",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0005"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "994"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0005089016619718308"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.010853962612041912"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000503456178265558"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009750606561696034"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5268077360000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1044"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0005017619516657686"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.523849472"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0006",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0006000000000000001"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "830"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006082555698795184"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.009191209785025295"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000602735921345563"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008234812151490051"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.522369137"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "872"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006010903174724053"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524159087"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0007",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0007000000000000001"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "712"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000708571620786517"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007823433090894212"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007030903266721907"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007055254847806133"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5193877680000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "748"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007014426981064088"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524686893"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0008",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0008000000000000001"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "623"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008089194157303374"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006828496858360085"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008034522826177895"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000611164680542835"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.516959448"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "654"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008017951428707951"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524383518"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0009",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0009000000000000002"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "554"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009082872328519855"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006126265423787953"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009027800905360124"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00054941989913754"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.514815663"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "582"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009011235712320125"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524463788"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.001",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0010000000000000002"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "499"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010086229759519048"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005485055388542774"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010031437666000492"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004923631784045008"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.513707802"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "524"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001001475909284053"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5247822560000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "153037"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "7.764162771094724e-06"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.5441551718680286"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "3.05725036652246e-06"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0422080578285922"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "15.000158798000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "369923"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.3516386844065532e-06"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5000379110000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0001",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0001"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "4880"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00010714987602459019"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.04579843914769717"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00010247656405124585"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.003070733813086406"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.623587079"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "5111"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00010137617021268466"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5181448860000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0002",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0002"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2466"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002074915798864561"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.023038028738255983"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00020283785226716245"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0015459112691612667"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5598459640000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2588"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002017284788341021"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.522084334"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0003",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.00030000000000000003"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1655"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00030685509425981873"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.015537955668238047"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00030217996281079384"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010230869145796749"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.539645641"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1737"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003010563825696243"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.522946315"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0004",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0004"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1243"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004072019324215605"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.011695107558689763"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004025163378863194"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007675334678184685"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.529549118"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1304"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004014085172875527"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.523448241"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0005",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0005"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "995"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0005075862180904529"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.009382010906172408"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0005028912236343076"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000614146973517185"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.523755894"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1044"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0005017611916494552"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.523849106"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0006",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0006000000000000001"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "831"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000606902394705175"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007845243707907233"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006022033425301283"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0005063896891280906"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.519957368"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "872"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006010888475890554"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524160638"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0007",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0007000000000000001"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "712"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007072048665730335"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0066432216884127464"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000702561125326692"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000444186835693086"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.516785433"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "747"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007014415899274179"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.523988754"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0008",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0008000000000000001"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "623"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008075409711075438"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005835313968640737"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008028804110677048"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00038458153244696385"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.514684533"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "654"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008017937429818903"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524384532"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0009",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0009000000000000002"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "555"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009069636108108111"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00524675883682327"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009022562016237966"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00035453453918318805"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.513753633"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "583"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009011228374919596"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.525365383"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.001",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0010000000000000002"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "499"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010072655711422854"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004687180507983469"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010025901990328623"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003187607548154873"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.511897986"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "524"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010014750939289121"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524783733"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "copy_sweep_grid_shape",
|
|
"index": 2,
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"devices": [
|
|
0,
|
|
1
|
|
],
|
|
"axes": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"flags": "pow2",
|
|
"values": [
|
|
{
|
|
"input_string": "6",
|
|
"description": "2^6 = 64",
|
|
"value": 64
|
|
},
|
|
{
|
|
"input_string": "8",
|
|
"description": "2^8 = 256",
|
|
"value": 256
|
|
},
|
|
{
|
|
"input_string": "10",
|
|
"description": "2^10 = 1024",
|
|
"value": 1024
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"flags": "pow2",
|
|
"values": [
|
|
{
|
|
"input_string": "6",
|
|
"description": "2^6 = 64",
|
|
"value": 64
|
|
},
|
|
{
|
|
"input_string": "8",
|
|
"description": "2^8 = 256",
|
|
"value": 256
|
|
},
|
|
{
|
|
"input_string": "10",
|
|
"description": "2^10 = 1024",
|
|
"value": 1024
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"states": [
|
|
{
|
|
"name": "Device=0 BlockSize=2^6 NumBlocks=2^6",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "64"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "78"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006491010679487182"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0014400823428293225"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006485689823444072"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001177496193520018"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "10347220700.783287"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "82777765606.2663"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.09510313144102286"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.507885141"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "81"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006479606722608024"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524857216"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 BlockSize=2^8 NumBlocks=2^6",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "256"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "656"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002171159740853659"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008002387413205372"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0021657661977337647"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007621859662189712"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "30986199743.177273"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "247889597945.41818"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.2847996299924382"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.437881264"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "657"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0021637841704410677"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.425925528"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 BlockSize=2^10 NumBlocks=2^6",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "752"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010918482712765959"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012885667055169438"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010864888095158216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.011885928967750255"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "61766732811.45538"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "494133862491.64307"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5677089412817591"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.8365534670000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "753"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010831134183156693"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.8183730920000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 BlockSize=2^6 NumBlocks=2^8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "64"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "256"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "231"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002170435731601731"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004358290289328953"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0021650726464919703"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.003568770929248165"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "30996125746.05075"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "247969005968.406"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.28489086163649585"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.50610783"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "243"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0021624730742026746"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5254883690000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 BlockSize=2^8 NumBlocks=2^8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "256"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "256"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "848"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001072975840801887"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.010997227168594192"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001067600981103923"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.009773228014049224"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "62859500120.174065"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "502876000961.3925"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5777527584574822"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.9275834220000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "849"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010646783151390692"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.907651745"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 BlockSize=2^10 NumBlocks=2^8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "256"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009655372026098907"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007865605432218092"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009601815831693899"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005554818166025829"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "69891846684.33807"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "559134773474.7046"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6423882967310485"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.4363716210000002"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1457"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009584971580017669"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.4059789770000002"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 BlockSize=2^6 NumBlocks=2^10",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "64"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "976"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010651546700819676"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.010462843606792348"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010597991125016906"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.009173166661501059"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "63322249668.22941"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "506577997345.83527"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5820059712153438"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.059860389"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "977"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010574653456130558"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.038207793"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 BlockSize=2^8 NumBlocks=2^10",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "256"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1231"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009616476466287569"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007525336475207418"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009562607302014941"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004999666932127862"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "70178416702.1681"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "561427333617.3448"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6450222123361039"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.209430215"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1232"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009541545041486059"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.182675849"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 BlockSize=2^10 NumBlocks=2^10",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "496"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001023795669354839"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.03114530461728092"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010184043220454653"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0306728390106973"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "65896091117.53555"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "527168728940.2844"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6056626021832312"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.518169856"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "542"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010072569899893336"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.54594076"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^6 NumBlocks=2^6",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "64"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2244"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0066659496501782385"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012044246591117944"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0066612275798478"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012013908599240357"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "10074549052.04325"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "80596392416.346"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.11008029995676627"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "15.003487063000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2245"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006649818384514629"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "14.950437548000002"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^8 NumBlocks=2^6",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "256"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "218"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0022997498486238524"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0031075885812940247"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0022950336933135985"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002319295976145478"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "29240905785.18147"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "233927246281.45175"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.319502904121301"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.505458689"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "228"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0022943040278919956"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5231123440000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^10 NumBlocks=2^6",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "426"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011787892863849767"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0053738632436882575"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011741032116289985"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.003574335492607712"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "57157550831.40471"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "457260406651.2377"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6245361760424466"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.510194696"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "448"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011726912089756558"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.525375562"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^6 NumBlocks=2^8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "64"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "256"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "226"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002220062486725664"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0026283185437807914"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002215349671060005"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0015099121235202378"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "30292673376.42893"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "242341387011.43143"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.33099511993475667"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5059807590000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "237"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002214404399887922"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524823987"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^8 NumBlocks=2^8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "256"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "256"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "544"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001132157450367647"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007421825838534079"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011274838826673863"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006150104794311432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "59520907599.348335"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "476167260794.7867"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6503595673005719"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.626143416"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "545"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011260322986392797"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6144447430000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^10 NumBlocks=2^8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "256"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "447"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011234373914988803"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0046157037787769705"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011187847153985792"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0019678845031191957"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "59983715433.66298"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "479869723469.30383"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6554164710846042"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5105567950000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "471"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001116505875962049"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5258857730000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^6 NumBlocks=2^10",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "64"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "448"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011217261607142856"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0051297852216839"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011170590700847755"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002961082814438008"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "60076378946.466095"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "480611031571.72876"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6564289657612117"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.510912252"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "471"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011151960174495754"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.525268305"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^8 NumBlocks=2^10",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "256"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "447"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011251578970917226"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005034162791339124"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011205025481964396"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0028024348992886213"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "59891754916.5938"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "479134039332.7504"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6544116577425022"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.511320454"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "469"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011173903865854878"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524067611"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^10 NumBlocks=2^10",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "474"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010600141455696206"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004975251378988354"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001055306057638257"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002145050602183929"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "63591849505.90315"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "508734796047.2252"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6948410129578578"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.51139696"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "499"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001053948106173284"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5259308100000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "copy_type_sweep",
|
|
"index": 3,
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"devices": [
|
|
0,
|
|
1
|
|
],
|
|
"axes": [
|
|
{
|
|
"name": "T",
|
|
"type": "type",
|
|
"flags": "",
|
|
"values": [
|
|
{
|
|
"input_string": "U8",
|
|
"description": "uint8_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "U16",
|
|
"description": "uint16_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "U32",
|
|
"description": "uint32_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "U64",
|
|
"description": "uint64_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "F32",
|
|
"description": "float",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "F64",
|
|
"description": "double",
|
|
"is_active": true
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"states": [
|
|
{
|
|
"name": "Device=0 T=U8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "U8"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "3008"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0022984299517952063"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.024386082027668385"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002292998504448446"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.02420260340376021"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "117067436144.95683"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "234134872289.91367"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.2689968661419045"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "6.978260496000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "3009"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0022799289537926114"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "6.889595338"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 T=U16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 1,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "U16"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "352"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001447471568181817"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006559305677944603"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0014420469982380224"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0053741666978658"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "93074447756.55377"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "372297791026.2151"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.4277318371165155"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.51661557"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "364"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0014379604465358862"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5234251630000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 T=U32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 2,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "U32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "960"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010732170854166677"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.010747135588535793"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010678389670948187"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.009483040603054523"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "62845490816.44543"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "502763926531.5634"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5776239964746822"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.050184244"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "961"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010641954020828663"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.027618288"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 T=U64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 3,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "U64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1232"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009393459350649342"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008173389439931525"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009339579984352195"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005779340051383488"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "35927131687.09736"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "574834106993.5577"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6604252148363485"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.1830953560000002"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1233"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009315095380951709"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.155529067"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 T=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 4,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "496"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001073113616935484"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.01045196550212673"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010676812894882687"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.009124433511281603"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "62854771981.7819"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "502838175854.2552"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5777093013031425"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.542738971"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "497"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010637711835818988"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.528728784"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 T=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 5,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1232"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009395226306818184"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008519805595594534"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009341417393804389"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006275440558350422"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "35920065002.399605"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "574721040038.3937"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6602953125441103"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.183265443"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1233"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009317267593676144"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.155949535"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 T=U8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "U8"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2640"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002704848576515149"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008230990844947604"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002700116645206098"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008022518759096101"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "99416244285.81326"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "198832488571.62653"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.2715697232457749"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "7.193921508000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2641"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002695621145895508"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "7.143042223"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 T=U16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 1,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "U16"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "330"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0015221530787878797"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0057375488336673195"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0015174821813901261"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004846658675573777"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "88447646796.77927"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "353790587187.11707"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.48321485356632027"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.508487203"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "347"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001516773410764139"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5263325360000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 T=U32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 2,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "U32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "704"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011323334801136371"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007608036417757464"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001127679999409751"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0063837058695073836"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "59510556217.30105"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "476084449738.4084"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6502464621645656"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.810527479"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "705"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011263038268326028"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.796312915"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 T=U64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 3,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "U64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "478"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010527462217573217"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005281184054309557"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010480330030289645"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002728222354093458"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "32016579538.070763"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "512265272609.1322"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.699663014380917"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.51213991"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "500"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00104523095703125"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.522627688"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 T=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 4,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "464"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011329571594827575"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00779514172858376"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011281946900075874"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006520518090102098"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "59483407070.05869"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "475867256560.46954"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6499498150137532"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.534519304"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "467"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011258338421774624"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5257752330000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 T=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 5,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "478"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010524297447698746"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0053011542236172425"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010477156826142983"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0027421021970940066"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "32026276361.802433"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "512420421788.8389"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6998749204939343"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.511908565"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "501"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010452877204575224"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5236998340000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "copy_type_conversion_sweep",
|
|
"index": 4,
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"devices": [
|
|
0,
|
|
1
|
|
],
|
|
"axes": [
|
|
{
|
|
"name": "In",
|
|
"type": "type",
|
|
"flags": "",
|
|
"values": [
|
|
{
|
|
"input_string": "I8",
|
|
"description": "int8_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "I16",
|
|
"description": "int16_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "I32",
|
|
"description": "int32_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "F32",
|
|
"description": "float",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "I64",
|
|
"description": "int64_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "F64",
|
|
"description": "double",
|
|
"is_active": true
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "type",
|
|
"flags": "",
|
|
"values": [
|
|
{
|
|
"input_string": "I8",
|
|
"description": "int8_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "I16",
|
|
"description": "int16_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "I32",
|
|
"description": "int32_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "F32",
|
|
"description": "float",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "I64",
|
|
"description": "int64_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "F64",
|
|
"description": "double",
|
|
"is_active": true
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"states": [
|
|
{
|
|
"name": "Device=0 In=I8 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I8 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 1,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "992"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006600980292338716"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.06265755233269708"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006543757735841723"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.061480066936899634"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "102554016681.31558"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "307662050043.9468"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.35347202440710795"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6754950980000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "993"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006166902596256644"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.613870906"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I8 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 2,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "268435456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "684"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007370927309941522"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008729576791697422"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007317126547558279"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004720480993831976"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "91714778422.67767"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "458573892113.3883"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5268541959023303"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5183597600000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "723"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007290440898383471"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5271070320000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I8 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 3,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "268435456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "680"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007416632955882347"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00842788883853582"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007362919512917023"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004273389706237406"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "91144367234.04161"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "455721836170.20807"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5235774772176104"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.518484646"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "712"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007338700883843925"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.522523862"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I8 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 4,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "536870912"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "528"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0012047073446969693"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.009645329133519535"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011993323018153505"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0085331592060071"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "55955187647.6784"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "503596688829.1056"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5785807546290276"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.647215045"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "529"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001196625677083526"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6339911500000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I8 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 5,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "536870912"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1200"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001180585506666666"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.010080936476778664"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001175182694693406"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008979393908657816"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "57105047838.97287"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "513945430550.75586"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5904703935555559"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.4422983710000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1201"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011731425545594744"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.4168893610000002"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I16 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 6,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I16 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 7,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I16 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 8,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "33554432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1696"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00043055614622641435"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.01653471806668262"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00042517247028157185"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.010615851149343741"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "78919578160.31331"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "473517468961.8799"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5440228273918657"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.766234603"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1697"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004232069438320117"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.7227836270000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I16 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 9,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "33554432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1184"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00043288785641891876"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.01493008601829662"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004275269453740999"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008146558252326382"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "78484952499.63434"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "470909714997.8061"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5410267865324059"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.537378945"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1238"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000424921975574894"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.526063107"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I16 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 10,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "33554432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "268435456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "768"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006571356510416664"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.010841823646108464"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006517510409466911"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007001797627972981"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "51483511175.15826"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "514835111751.5826"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5914925456704763"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5206853530000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "811"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006489950196516646"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.526342823"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I16 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 11,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "33554432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "268435456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "768"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006567598033854167"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.010681410148370487"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006514065422428152"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006845472386064585"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "51510738416.09102"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "515107384160.9102"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5918053586407517"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5204057160000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "805"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006483509893002717"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.521930972"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I32 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 12,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I32 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 13,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I32 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 14,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I32 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 15,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1904"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002683022746848742"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0242735351289231"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00026296673859117433"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.013347568443463995"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "63799764524.90815"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "510398116199.2652"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5863948945304058"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.550977791"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2015"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002597611117303815"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.523427203"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I32 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 16,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1328"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000382968452560241"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.016207309597693925"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00037759019212281323"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0077904180984165565"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "44432340537.44468"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "533188086449.3362"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6125782243213881"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5365996590000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1396"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003753899079680784"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524052128"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I32 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 17,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1328"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00038311062575301184"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.016679525157534167"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00037773971044155724"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00868318074979036"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "44414753165.31679"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "532977037983.8014"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6123357513600659"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.53682523"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1367"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003756192367096059"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.513480453"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=F32 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 18,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=F32 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 19,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=F32 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 20,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1904"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00026897186554621826"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.02368215369603371"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00026361388154327934"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012168663554835448"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "63643143152.328896"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "509145145218.63116"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5849553598559641"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5524873680000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1991"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00026033155670242655"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5183284650000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=F32 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 21,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=0 In=F32 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 22,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1328"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003830804947289157"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.01669586098492127"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003777089391846278"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008791389028513721"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "44418371554.07946"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "533020458648.95355"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6123856372345514"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.536636844"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1404"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003754535468555244"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.527143864"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=F32 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 23,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1328"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003830802665662652"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.01641959194869226"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00037772002358393717"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008238450770780798"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "44417068072.833466"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "533004816874.0016"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6123676664453144"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.536685893"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1389"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000375325125357159"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.521334187"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I64 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 24,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I64 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 25,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I64 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 26,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I64 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 27,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I64 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 28,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I64 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 29,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "8388608"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2112"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002422139554924242"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.02400462331990132"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00023683451699572973"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007866889921134117"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "35419701935.386604"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "566715230966.1857"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6510974620475479"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.556787641"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2225"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00023412507132198032"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5209357530000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=F64 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 30,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=F64 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 31,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=F64 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 32,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=F64 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 33,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=F64 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 34,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "8388608"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2112"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00024282649337121185"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.024008982414037136"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002374703656091845"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008267454870626736"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "35324862445.386154"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "565197799126.1785"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6493540890695985"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5575585390000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2214"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00023455057945354847"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.519302479"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=F64 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 35,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I8 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I8 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 1,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "992"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006824859284274195"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.032440596768964644"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006776485806030618"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.03134186379501587"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "99031955383.5376"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "297095866150.6128"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.40577997452826264"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.696101135"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "993"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006594375443362513"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6568292870000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I8 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 2,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "268435456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "592"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008640237381756752"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.01007729538332117"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008593635128156565"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008503073738172521"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "78091358312.52779"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "390456791562.639"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5332943503641813"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.522774643"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "615"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008578066445947664"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.527563349"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I8 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 3,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "268435456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "656"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008614595929878056"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0098819558217369"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008567909279429336"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008239605399217536"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "78325834006.11096"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "391629170030.5548"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5348956102908583"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.577643271"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "657"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008551398322462489"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.562343075"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I8 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 4,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "536870912"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "528"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0014573860359848496"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006512604739977204"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0014527478784774298"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00567750642698368"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "46194432629.517426"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "415749893665.65686"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5678402175284868"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.779591082"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "529"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0014509199143357438"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.7691349590000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I8 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 5,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "536870912"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "352"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0014603711335227268"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006314392964515164"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0014557048187337147"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005429185609780399"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "46100598923.87834"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "414905390314.905"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5666867765446146"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.520741476"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "363"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0014527880455836777"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.527373104"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I16 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 6,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I16 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 7,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I16 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 8,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "33554432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1104"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004611589565217395"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012940549047826096"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004564886665700571"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007894471093785426"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "73505509462.30429"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "441033056773.82574"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6023725097981667"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.530236417"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1149"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00045453528926308207"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5222721800000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I16 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 9,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "33554432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1104"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004598074438405802"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012817089346835498"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004551599129116618"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007723313560215716"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "73720094955.97276"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "442320569735.83655"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6041310229128012"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.528748192"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1166"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00045311976542399224"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5283480380000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I16 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 10,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "33554432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "268435456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "672"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007533759211309518"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00846442255799797"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007486925714959693"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005711239714249503"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "44817370009.36792"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "448173700093.6792"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6121253552415854"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.518973692"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "701"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007453038627853066"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5224711350000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I16 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 11,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "33554432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "268435456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "672"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007515454092261914"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008392859436579346"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000746849381497928"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005530757922319482"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "44927977221.72726"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "449279772217.2726"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6136360525257766"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.517848211"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "704"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007438007701526989"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.523646363"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I32 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 12,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I32 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 13,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I32 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 14,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I32 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 15,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1840"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00027774970760869537"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.01802281728547624"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00027309293929973365"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0057206437926147526"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "61434089226.254715"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "491472713810.0377"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6712640868253356"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.547010069"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1923"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002714794236300702"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.522066944"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I32 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 16,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1195"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00042314036485355624"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012144571947569476"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00041846681174872806"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00470128993484091"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "40092106539.79896"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "481105278477.5875"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6571040188996771"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5286744840000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1263"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00041609304251410327"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.525537319"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I32 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 17,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1195"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004230686694560669"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012123446476677473"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000418410443611225"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004701276954550611"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "40097507737.136955"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "481170092845.6435"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6571925437686346"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.528583919"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1258"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004162648114566773"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.523674003"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=F32 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 18,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=F32 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 19,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=F32 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 20,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1808"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00028176399834070837"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.021188068496395096"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002770714691914288"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012675642708623109"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "60551943687.88875"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "484415549503.11"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6616252588274557"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.544610159"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1911"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00027552812178056315"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.526544561"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=F32 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 21,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=1 In=F32 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 22,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1195"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00042314520585774067"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0121383962955979"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004184789956862957"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004742822949323976"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "40090939265.62733"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "481091271187.5279"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6570848874392591"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.528606066"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1257"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00041616969385503683"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.523135939"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=F32 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 23,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1195"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004232182125523013"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012193893187009835"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00041854015763334633"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004766137974469347"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "40085080712.12928"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "481020968545.5514"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6569888665668042"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.528650917"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1253"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004162007763399092"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5215109210000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I64 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 24,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I64 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 25,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I64 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 26,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I64 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 27,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I64 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 28,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I64 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 29,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "8388608"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1909"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002666127674174964"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.018360326425103806"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00026193604216705505"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004085453563964175"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "32025405631.84502"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "512406490109.5203"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6998558923043056"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.546196184"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1976"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002601132721070819"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5139936970000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=F64 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 30,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=F64 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 31,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=F64 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 32,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=F64 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 33,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=F64 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 34,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "8388608"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1910"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002665624010471204"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.018356743233435932"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00026189029580323475"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004090112866661142"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "32030999752.287834"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "512495996036.60535"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6999781414398565"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.546475312"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2007"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00026003379042491905"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.521898494"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=F64 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 35,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|