mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-03-14 20:27:24 +00:00
* Add cuda architectures to build wheel for * Package scripts in wheel * Separate cuda major version extraction to fix architecutre selection logic * Add back statement printing cuda version * [pre-commit.ci] auto code formatting --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
17525 lines
529 KiB
JSON
17525 lines
529 KiB
JSON
{
|
|
"meta": {
|
|
"argv": [
|
|
"bin/nvbench.example.axes",
|
|
"--json",
|
|
"/home/av/code/src/nvbench/scripts/test_cmp.json"
|
|
],
|
|
"version": {
|
|
"json": {
|
|
"major": 1,
|
|
"minor": 0,
|
|
"patch": 0,
|
|
"string": "1.0.0"
|
|
},
|
|
"nvbench": {
|
|
"major": 0,
|
|
"minor": 1,
|
|
"patch": 0,
|
|
"string": "0.1.0",
|
|
"git_branch": "walltime_reports",
|
|
"git_sha": "348acbd6eb752a87e15c28fe1ad1cb827eaaadec",
|
|
"git_version": "old-cmake-63-g348acbd",
|
|
"git_is_dirty": false
|
|
}
|
|
}
|
|
},
|
|
"devices": [
|
|
{
|
|
"id": 0,
|
|
"name": "Quadro GV100",
|
|
"sm_version": 700,
|
|
"ptx_version": 700,
|
|
"sm_default_clock_rate": 1627000000,
|
|
"number_of_sms": 80,
|
|
"max_blocks_per_sm": 32,
|
|
"max_threads_per_sm": 2048,
|
|
"max_threads_per_block": 1024,
|
|
"registers_per_sm": 65536,
|
|
"registers_per_block": 65536,
|
|
"global_memory_size": 34086060032,
|
|
"global_memory_bus_peak_clock_rate": 850000000,
|
|
"global_memory_bus_width": 4096,
|
|
"global_memory_bus_bandwidth": 870400000000,
|
|
"l2_cache_size": 6291456,
|
|
"shared_memory_per_sm": 98304,
|
|
"shared_memory_per_block": 49152,
|
|
"ecc_state": false
|
|
},
|
|
{
|
|
"id": 1,
|
|
"name": "Quadro GP100",
|
|
"sm_version": 600,
|
|
"ptx_version": 600,
|
|
"sm_default_clock_rate": 1442500000,
|
|
"number_of_sms": 56,
|
|
"max_blocks_per_sm": 32,
|
|
"max_threads_per_sm": 2048,
|
|
"max_threads_per_block": 1024,
|
|
"registers_per_sm": 65536,
|
|
"registers_per_block": 65536,
|
|
"global_memory_size": 17069309952,
|
|
"global_memory_bus_peak_clock_rate": 715000000,
|
|
"global_memory_bus_width": 4096,
|
|
"global_memory_bus_bandwidth": 732160000000,
|
|
"l2_cache_size": 4194304,
|
|
"shared_memory_per_sm": 65536,
|
|
"shared_memory_per_block": 49152,
|
|
"ecc_state": false
|
|
}
|
|
],
|
|
"benchmarks": [
|
|
{
|
|
"name": "simple",
|
|
"index": 0,
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"devices": [
|
|
0,
|
|
1
|
|
],
|
|
"axes": null,
|
|
"states": [
|
|
{
|
|
"name": "Device=0",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": null,
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "499"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001009524801603207"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006144561739025865"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010034006580799991"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0005237510233783218"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.514396598"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "524"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001001475909284053"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524788153"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": null,
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "499"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010077174468937882"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00494341955894122"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010027929121602258"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00033287816568109313"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5123603010000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "524"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010014740456151597"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524795703"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "single_float64_axis",
|
|
"index": 1,
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"devices": [
|
|
0,
|
|
1
|
|
],
|
|
"axes": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"flags": "",
|
|
"values": [
|
|
{
|
|
"input_string": "0",
|
|
"description": "",
|
|
"value": 0.0
|
|
},
|
|
{
|
|
"input_string": "0.0001",
|
|
"description": "",
|
|
"value": 0.0001
|
|
},
|
|
{
|
|
"input_string": "0.0002",
|
|
"description": "",
|
|
"value": 0.0002
|
|
},
|
|
{
|
|
"input_string": "0.0003",
|
|
"description": "",
|
|
"value": 0.00030000000000000003
|
|
},
|
|
{
|
|
"input_string": "0.0004",
|
|
"description": "",
|
|
"value": 0.0004
|
|
},
|
|
{
|
|
"input_string": "0.0005",
|
|
"description": "",
|
|
"value": 0.0005
|
|
},
|
|
{
|
|
"input_string": "0.0006",
|
|
"description": "",
|
|
"value": 0.0006000000000000001
|
|
},
|
|
{
|
|
"input_string": "0.0007",
|
|
"description": "",
|
|
"value": 0.0007000000000000001
|
|
},
|
|
{
|
|
"input_string": "0.0008",
|
|
"description": "",
|
|
"value": 0.0008000000000000001
|
|
},
|
|
{
|
|
"input_string": "0.0009",
|
|
"description": "",
|
|
"value": 0.0009000000000000002
|
|
},
|
|
{
|
|
"input_string": "0.001",
|
|
"description": "",
|
|
"value": 0.0010000000000000002
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"states": [
|
|
{
|
|
"name": "Device=0 Duration=0",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "127632"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "9.535606282123409e-06"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.4448218958078975"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "3.918024581663389e-06"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.14066541529910018"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "11.513563003000002"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "274328"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.8226457245237315e-06"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.500101118"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0001",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0001"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "4853"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00010851134411704107"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.053377272961503276"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00010302993536069301"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004807683479660842"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6373502280000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "5088"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00010137620362095862"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.51581551"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0002",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0002"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2459"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002088847271248475"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.027095357105136896"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00020339123081777852"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002426402384835198"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5670174410000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2582"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00020172880307174672"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.520878249"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0003",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.00030000000000000003"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1652"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003082859001210656"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.01842186373388549"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003027270989578126"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0016270299573856555"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.544737606"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1736"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00030105657621462773"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.522648918"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0004",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0004"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1241"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00040859692667203864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.013800282471048258"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004030542842665574"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0012342926945401174"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.533285391"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1304"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00040140879812416123"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5234506480000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0005",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0005"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "994"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0005090076327967808"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.01104211789520747"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0005034694101968762"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000960945456149481"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.526845475"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1044"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0005017609577982818"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5238518600000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0006",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0006000000000000001"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "830"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006083229987951809"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.009259805546541143"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006027641820620359"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008125705181484989"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.52231507"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "872"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006010903174724053"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5241642790000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0007",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0007000000000000001"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "712"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007086338553370777"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007928264539185437"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007030805292424196"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007112507950799924"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.519468829"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "748"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007014426981064088"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.52469385"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0008",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0008000000000000001"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "623"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008089985730337072"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006971030802740222"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008034196651957732"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006306208005906063"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5170688250000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "654"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008017951428707951"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5243872230000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.0009",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0009000000000000002"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "554"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009083576299638984"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006199510137107782"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009027842496276245"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0005444417680564487"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.514841552"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "582"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009011235712320125"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524466611"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 Duration=0.001",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0010000000000000002"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "499"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010087251282565122"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005573661860035435"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010031565917517711"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004852012011897464"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5138195830000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "524"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010014756763254413"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524785882"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "153013"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "7.705666139478051e-06"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.5262458153177543"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "3.057407826310601e-06"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.045574170376734044"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "15.000211589000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "369906"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.3516989302429717e-06"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.500042922"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0001",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0001"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "4879"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00010713845111703245"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.045460323768744995"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00010249834163043719"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0030010311127595573"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6230727620000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "5081"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00010137619922490036"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5151083350000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0002",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0002"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2465"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00020751516592292123"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.023007065837400455"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00020286964052951872"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0014997658908938753"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.559679316"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2588"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00020172862033755555"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.522088477"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0003",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.00030000000000000003"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1655"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003068471528700908"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.015443555151131"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00030220268294890517"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010498159491600372"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.539562934"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1736"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003010567520071284"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.52264897"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0004",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0004"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1243"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00040717730973451277"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.011630958382375049"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00040252058700697966"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007711533484593173"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5294895540000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1305"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004014086726981561"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.52385337"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0005",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0005"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "995"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0005075514221105535"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.009291726931158024"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0005029017407690461"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006019586171273846"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.523705419"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1044"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0005017608408726951"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5238529980000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0006",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0006000000000000001"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "831"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006068636666666669"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00775159368655319"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000602217434115358"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0005211064062823375"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5198631410000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "873"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006010892503176905"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524771732"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0007",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0007000000000000001"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "712"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007072028300561799"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006649464561878749"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007025522259848826"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004304629385174026"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.516796464"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "748"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007014422085195939"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524693347"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0008",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0008000000000000001"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "623"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008076071910112361"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005864235047342223"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008029232501600935"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000386286201448909"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.514722272"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "655"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008017945267771947"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5251914990000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.0009",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0009000000000000002"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "555"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009069257099099103"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005199849951312571"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009022579880448067"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000339409683584611"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.513695142"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "582"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00090112220790378"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5244711550000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 Duration=0.001",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "Duration",
|
|
"type": "float64",
|
|
"value": "0.0010000000000000002"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "499"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010072258977955914"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004633193202486146"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010026042473340073"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003115372302150914"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.511907711"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "524"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010014748609703007"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524787242"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "copy_sweep_grid_shape",
|
|
"index": 2,
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"devices": [
|
|
0,
|
|
1
|
|
],
|
|
"axes": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"flags": "pow2",
|
|
"values": [
|
|
{
|
|
"input_string": "6",
|
|
"description": "2^6 = 64",
|
|
"value": 64
|
|
},
|
|
{
|
|
"input_string": "8",
|
|
"description": "2^8 = 256",
|
|
"value": 256
|
|
},
|
|
{
|
|
"input_string": "10",
|
|
"description": "2^10 = 1024",
|
|
"value": 1024
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"flags": "pow2",
|
|
"values": [
|
|
{
|
|
"input_string": "6",
|
|
"description": "2^6 = 64",
|
|
"value": 64
|
|
},
|
|
{
|
|
"input_string": "8",
|
|
"description": "2^8 = 256",
|
|
"value": 256
|
|
},
|
|
{
|
|
"input_string": "10",
|
|
"description": "2^10 = 1024",
|
|
"value": 1024
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"states": [
|
|
{
|
|
"name": "Device=0 BlockSize=2^6 NumBlocks=2^6",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "64"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "78"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00648948455128205"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0015111507522308748"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006484057010748448"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0012531664584969381"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "10349826333.845528"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "82798610670.76422"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.09512708027431552"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5077619640000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "81"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006481402361834491"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5250069540000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 BlockSize=2^8 NumBlocks=2^6",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "256"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "672"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00217197076636905"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0074534188597851336"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002166515097376846"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00701989634431853"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "30975488738.229183"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "247803909905.83347"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.2847011832557829"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.473432187"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "673"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002163565506021122"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.4606610070000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 BlockSize=2^10 NumBlocks=2^6",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "688"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010916693808139535"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.01306842599006877"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001086250233269015"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012078568140597113"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "61780298815.71512"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "494242390525.72095"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.567833628820911"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.765170478"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "689"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010836307621832676"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.748836308"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 BlockSize=2^6 NumBlocks=2^8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "64"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "256"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "231"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002171097186147186"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00443956157556455"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0021655962921324217"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.003655102168422409"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "30988630819.05223"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "247909046552.41785"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.28482197443981827"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.506240788"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "243"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002161031840760031"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.525142297"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 BlockSize=2^8 NumBlocks=2^8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "256"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "256"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "736"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010725499320652177"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.011413433377036444"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001067108783223058"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.010207282915832727"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "62888493708.49215"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "503107949667.9372"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5780192436442293"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.804676228"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "737"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010644761438770877"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.787106834"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 BlockSize=2^10 NumBlocks=2^8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "256"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1488"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009658611908602143"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007916411658808452"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009604295065966908"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005543866520742756"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "69873804937.337"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "558990439498.696"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.642222471850524"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.468436431"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1489"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009584573153443874"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.4370937890000002"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 BlockSize=2^6 NumBlocks=2^10",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "64"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "528"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010655318598484856"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.010411330423168705"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010600458776408978"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.009061611791593436"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "63307509057.38994"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "506460072459.1195"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.581870487659834"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.573568937"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "529"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010571805049431119"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.559696812"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 BlockSize=2^8 NumBlocks=2^10",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "256"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1032"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009617264147286825"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007568406287684157"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009562815504018629"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0049994946059467"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "70176888774.8577"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "561415110198.8616"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6450081688865598"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.013943156"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1033"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009539899551395297"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.9906108020000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 BlockSize=2^10 NumBlocks=2^10",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "560"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010253841303571433"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.03170958999602246"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010199987426400187"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.031247624116965786"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "65793085025.09035"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "526344680200.7228"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6047158550100216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5859060660000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "561"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010093532926046065"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5665410750000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^6 NumBlocks=2^6",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "64"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2245"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0066631781487750605"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.010029284827333777"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006658390919190473"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.009997863006145854"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "10078841091.558964"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "80630728732.47171"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.11012719724168449"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "15.004037418000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2246"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006649344001406553"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "14.956094204000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^8 NumBlocks=2^6",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "256"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "218"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002299290371559632"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0030536390935653273"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002294595665887955"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002251444609090054"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "29246487735.359"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "233971901882.872"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.3195638957097793"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5053657460000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "228"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002293837965580455"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.523009414"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^10 NumBlocks=2^6",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "426"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001178968861502347"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005201397517740588"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011742734310212829"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0033150798656458847"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "57149265432.69776"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "457194123461.5821"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6244456450251067"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5103190240000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "450"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011726047092013889"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.527687784"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^6 NumBlocks=2^8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "64"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "256"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "226"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002219887185840708"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0026061881909203283"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002215178051881032"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0015054811845863602"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "30295020277.49602"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "242360162219.96817"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.33102076352159115"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.506002865"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "237"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0022142488503757913"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524792129"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^8 NumBlocks=2^8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "256"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "256"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "448"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011318572321428575"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007609392746712896"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011271811462938788"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0063649890038617206"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "59536893622.33475"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "476295148978.678"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6505342397545317"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5155772900000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "470"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011258403372257314"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.529160021"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^10 NumBlocks=2^8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "256"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "447"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011232368366890376"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004611911863103576"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001118592285736562"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0019864118812352185"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "59994034337.37313"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "479952274698.98505"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.655529221343675"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.510514425"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "470"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011164527406083776"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.52474862"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^6 NumBlocks=2^10",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "64"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "448"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011216608169642855"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005148879095566737"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011169912165829119"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002975557170232136"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "60080028386.70366"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "480640227093.6293"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6564688416379333"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.510971747"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "470"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011154764378324467"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524288288"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^8 NumBlocks=2^10",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "256"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "447"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011246830559284123"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004922192808378086"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011200362225240248"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002646502354772987"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "59916690773.418724"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "479333526187.3498"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6546841212130542"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.511144538"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "469"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011177327820995468"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5242309390000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 BlockSize=2^10 NumBlocks=2^10",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "BlockSize",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
},
|
|
{
|
|
"name": "NumBlocks",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "474"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010598897257383965"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004913062706223566"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001055195342387831"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0020503329663902545"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "63598521813.16255"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "508788174505.3004"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6949139184130524"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.511291385"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "498"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010538602162556477"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524838223"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "copy_type_sweep",
|
|
"index": 3,
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"devices": [
|
|
0,
|
|
1
|
|
],
|
|
"axes": [
|
|
{
|
|
"name": "T",
|
|
"type": "type",
|
|
"flags": "",
|
|
"values": [
|
|
{
|
|
"input_string": "U8",
|
|
"description": "uint8_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "U16",
|
|
"description": "uint16_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "U32",
|
|
"description": "uint32_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "U64",
|
|
"description": "uint64_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "F32",
|
|
"description": "float",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "F64",
|
|
"description": "double",
|
|
"is_active": true
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"states": [
|
|
{
|
|
"name": "Device=0 T=U8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "U8"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2992"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0022994002396390365"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.024778400174351137"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002293938610882044"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.02459574709695746"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "117019459338.00893"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "234038918676.01785"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.2688866253171161"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "6.944152369"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2993"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0022801307408338873"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "6.853598372"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 T=U16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 1,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "U16"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "672"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0014471324925595243"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006960808950083016"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001441753045966228"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005894275617037584"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "93093424269.51526"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "372373697078.06104"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.4278190453562282"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.98617708"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "673"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001438309451800399"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.9712327470000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 T=U32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 2,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "U32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "848"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001072196766509434"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.011171612715506738"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010668103765204251"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.009987563873112983"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "62906084789.7697"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "503248678318.1576"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5781809263765597"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.9268544190000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "849"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001064370134974818"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.9073867080000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 T=U64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 3,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "U64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1568"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009390030325255086"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008249904375540816"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009335942644701952"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005885112868111755"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "35941129114.62859"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "575058065834.0575"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6606825204894962"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.505405182"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1569"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009319373003763345"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.4727063310000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 T=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 4,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "752"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010726744441489362"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.01103458165791857"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010672343821918702"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.009783271621840188"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "62881092588.27738"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "503048740706.21906"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5779512186422553"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.822184149"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "753"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001063365562503555"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.803427655"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 T=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 5,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "544"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009393335257352945"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007706533640467741"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000933937587282237"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005096920917422438"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "35927916872.52203"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "574846669960.3525"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6604396483919491"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5221495780000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "565"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0009305206028761061"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.525756552"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 T=U8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "U8"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2784"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0027056495269396513"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.009323042699490573"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0027009093115727078"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.009134230706566165"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "99387067477.54266"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "198774134955.08533"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.2714900226112944"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "7.588833747000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2785"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002695659536947251"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "7.532905504"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 T=U16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 1,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "U16"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "330"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0015226199969696965"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005622755807814305"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0015179373560529775"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004685161525974869"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "88421124537.70831"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "353684498150.83325"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.4830699548607316"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.508664443"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "349"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0015155031045733347"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.528926318"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 T=U32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 2,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "U32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "528"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011321445473484848"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007683380682909642"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011274604856064824"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006460085786164455"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "59522142777.2707"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "476177142218.1656"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6503730635628354"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.607756878"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "529"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011264972348745013"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5965697010000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 T=U64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 3,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "U64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "478"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010525728723849374"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005381193387193611"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00104785640469156"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0028679450954259256"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "32021975386.863106"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "512351606189.8097"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6997809306569734"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.512086032"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "500"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010454827880859374"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5227592090000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 T=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 4,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "528"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011322722803030294"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0076927816018557355"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011275246077866272"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006425647618712464"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "59518757760.62857"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "476150062085.02856"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.650336076929945"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.608051446"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "529"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011258555051284391"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.596245555"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 T=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 5,
|
|
"axis_values": [
|
|
{
|
|
"name": "T",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "478"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010523593117154819"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005286902872056256"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001047618542256216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.002706934586546566"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "32029245996.099976"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "512467935937.5996"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6999398163483386"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.51215675"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "503"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0010453338319691226"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.525822574"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "copy_type_conversion_sweep",
|
|
"index": 4,
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"devices": [
|
|
0,
|
|
1
|
|
],
|
|
"axes": [
|
|
{
|
|
"name": "In",
|
|
"type": "type",
|
|
"flags": "",
|
|
"values": [
|
|
{
|
|
"input_string": "I8",
|
|
"description": "int8_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "I16",
|
|
"description": "int16_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "I32",
|
|
"description": "int32_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "F32",
|
|
"description": "float",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "I64",
|
|
"description": "int64_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "F64",
|
|
"description": "double",
|
|
"is_active": true
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "type",
|
|
"flags": "",
|
|
"values": [
|
|
{
|
|
"input_string": "I8",
|
|
"description": "int8_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "I16",
|
|
"description": "int16_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "I32",
|
|
"description": "int32_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "F32",
|
|
"description": "float",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "I64",
|
|
"description": "int64_t",
|
|
"is_active": true
|
|
},
|
|
{
|
|
"input_string": "F64",
|
|
"description": "double",
|
|
"is_active": true
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"states": [
|
|
{
|
|
"name": "Device=0 In=I8 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I8 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 1,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1008"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006586167946428575"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.05914716011832632"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006528769518056576"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.057982657554439924"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "102789451847.54562"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "308368355542.63684"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.35428349671718384"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6850768"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1009"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006166892571539062"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.623953807"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I8 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 2,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "268435456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "684"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007371795058479537"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008847285785468822"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000731761917384746"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004879904384809398"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "91708604131.57506"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "458543020657.87524"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5268187277778897"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5183809100000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "719"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000729479623935153"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5245085300000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I8 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 3,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "268435456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "680"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007416148632352943"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00847915084559806"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000736235386308502"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0043064135466205815"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "91151369858.06279"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "455756849290.3139"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5236177036883202"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5183531140000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "718"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007336860167946988"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.526798324"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I8 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 4,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "536870912"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "528"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0012050906723484857"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.009907122479821073"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011996847262436706"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008827766250664237"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "55938750016.53507"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "503448750148.8156"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5784107883143562"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.647279072"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "529"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011969163755838723"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.634178896"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I8 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 5,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "536870912"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1040"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011804124500000013"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00957111143535521"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011749697549985022"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00836195198403357"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "57115396983.206215"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "514038572848.85596"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5905774044678952"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.249563157"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1041"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0011735446663800626"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "1.2279065690000002"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I16 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 6,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I16 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 7,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I16 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 8,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "33554432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1632"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00043062034803921626"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.01672483523731806"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00042520409690983404"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.010836224516018789"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "78913708131.82764"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "473482248790.9659"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5439823630410913"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.737060063"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1633"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004232788786191731"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.695498727"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I16 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 9,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "33554432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1184"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00043284459121621524"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.015105337896417907"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004274506211733894"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008262516586090977"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "78498966518.9634"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "470993799113.7804"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5411233905259426"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5372377770000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1232"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004249965618183087"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.523606656"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I16 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 10,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "33554432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "268435456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "768"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006572663450520837"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.011014295443548292"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006518266665128367"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007223480904816997"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "51477538007.934814"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "514775380079.34814"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5914239201279275"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.520738605"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "796"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006487323243414338"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.516403752"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I16 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 11,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "33554432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "268435456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "880"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006565674102272736"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.01097889608017816"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006511251280253577"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007150441437138621"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "51533001194.03968"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "515330011940.39685"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5920611350418162"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.596114039"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "881"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006487893111724723"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.572324006"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I32 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 12,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I32 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 13,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I32 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 14,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I32 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 15,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1904"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00026858391123949583"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.024532996731547897"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002631697807648852"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0133776106644627"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "63750541385.25386"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "510004331082.0309"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5859424759674068"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.551536212"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1969"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00025964095085147915"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5112442500000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I32 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 16,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1328"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00038312173493975965"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0162544382499927"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00037769761349422534"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00766160749599669"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "44419703489.221306"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "533036441870.65564"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6124040003109554"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.536583359"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1388"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003753291852879593"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.520968135"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I32 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 17,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1328"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00038320030346385516"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.016795599989722854"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00037778021639819085"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008713885990809477"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "44409990972.94271"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "532919891675.31256"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6122700961343205"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.536768873"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1377"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003755766647660222"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.517180861"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=F32 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 18,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=F32 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 19,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=F32 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 20,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1904"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00026906845745798324"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.023966161873692115"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00026363512487033393"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012223432341603665"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "63638014882.31772"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "509104119058.54175"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5849082250213026"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.552411471"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1961"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002602678033419253"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5103970170000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=F32 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 21,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=0 In=F32 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 22,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1328"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00038308867695783106"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.016918671591625058"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003776144570480286"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008723758091138187"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "44429485383.46378"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "533153824601.56537"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6125388609852543"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5366656240000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1396"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003754830346749642"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5241867800000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=F32 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 23,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1328"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00038299202560240965"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0167760658438423"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0003775633729949433"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008684523141297206"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "44435496660.91339"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "533225959930.9607"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6126217370530339"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5364044290000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1404"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00037541050924534816"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5270894850000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=I64 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 24,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I64 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 25,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I64 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 26,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I64 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 27,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I64 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 28,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=0 In=I64 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 29,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "8388608"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2112"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002423827249053035"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.02416424179820878"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00023696183533210337"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007872204592971034"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "35400671117.538055"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "566410737880.6089"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6507476308370966"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5565369530000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2205"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00023414492098922904"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5163003700000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=F64 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 30,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=F64 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 31,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=F64 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 32,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=F64 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 33,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=0 In=F64 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 34,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "8388608"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2112"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00024286170075757575"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.02414779678250403"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00023745110798909405"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008091753071026355"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "35327727341.60197"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "565243637465.6315"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6494067526029773"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5576062110000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2233"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00023462851593113247"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.523938348"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=0 In=F64 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 0,
|
|
"type_config_index": 35,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I8 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 0,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I8 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 1,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1024"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000683441244140624"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.03316062878230732"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0006786162495845936"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.03212477441508221"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "98890741329.99298"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "296672223989.97894"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.4052013548814179"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.719493135"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1025"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000659710607877592"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.678423381"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I8 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 2,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "268435456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "592"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008640211064189187"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00944028164858259"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008593624308705327"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007736902908752538"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "78091456630.25882"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "390457283151.29407"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5332950217866232"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5227107280000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "614"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008578363971523819"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5267261910000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I8 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 3,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "268435456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "592"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0008612816165540544"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00975919715067052"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000856635513035832"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008096027798987914"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "78340044253.09521"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "391700221265.4761"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5349926536077853"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.521050054"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "599"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000855541095510747"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5124821390000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I8 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 4,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "536870912"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "672"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0014581301889880955"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006071682334960142"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0014534626205762236"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005154971568436681"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "46171716458.311646"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "415545448124.8048"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5675609813767548"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.9926510430000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "673"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.001450536531289656"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.9791953990000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I8 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 5,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I8"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "536870912"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "352"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0014604223210227273"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.006352174295896549"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0014556942754848428"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005460161744719934"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "46100932819.597916"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "414908395376.3812"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5666908809227235"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.520818273"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "361"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0014522860624783588"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524288878"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I16 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 6,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I16 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 7,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I16 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 8,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "33554432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1104"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00046094446557971044"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012758359369013577"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004563006377252548"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007641740734756292"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "73535799045.2856"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "441214794271.71356"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6026207308125459"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.529887408"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1140"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.000454689802203262"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.518361091"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I16 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 9,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "33554432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1104"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004598212318840582"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012784453789403875"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004551693620025247"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.007660841909416756"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "73718564563.25784"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "442311387379.54706"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6041184814515229"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5287007920000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1154"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004530425443599707"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5228252680000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I16 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 10,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "33554432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "268435456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "672"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007534447321428569"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008541470607558692"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007488120960160388"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005872606532245015"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "44810216312.63993"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "448102163126.3993"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6120276485008732"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5189650410000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "701"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007457322407721113"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.522771435"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I16 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 11,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I16"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "33554432"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "268435456"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "672"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007513076056547618"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.008183588591017211"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007466521440517336"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005268370256387482"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "44939845505.452805"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "449398455054.528"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6137981521177448"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.517594289"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "705"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0007440871218417554"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.524597968"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I32 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 12,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I32 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 13,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I32 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 14,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I32 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 15,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1840"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002776829885869563"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.017861418449176162"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002730688870924969"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.005670388220307151"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "61439500408.250595"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "491516003266.00476"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6713232125027382"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.546509219"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1927"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002714873839983621"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.523169341"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I32 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 16,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1196"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004229804180602015"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012054556683918517"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004183483349290177"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004710258623264892"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "40103460679.117165"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "481241528149.406"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6572901116551109"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.528636114"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1252"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004159514698357628"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.520784637"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I32 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 17,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1195"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004231393305439326"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012074190328924192"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004185113171653266"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004772051537598408"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "40087843056.75637"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "481054116681.0764"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6570341410089002"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.528483731"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1258"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00041619300539050074"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.523585816"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=F32 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 18,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=F32 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 19,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=F32 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 20,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1808"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00028167357632743345"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.021202385815789038"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00027701768152151984"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012884867104306086"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "60563700872.27331"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "484509606978.18646"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6617537245659234"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5443872160000001"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1858"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002751795970970129"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.5112970170000001"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=F32 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 21,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=1 In=F32 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 22,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1195"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00042308463263598364"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012089671437059933"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004184455500237615"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004773399327485907"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "40094143668.26772"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "481129724019.21265"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.657137407150367"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.528290021"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1264"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004161183321023289"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.525988793"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=F32 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 23,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F32"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "16777216"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "134217728"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1195"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004230846794979085"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.012109978274913669"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0004184373557567601"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004746400070144573"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "40094928832.67498"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "481139145992.0998"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6571502758851887"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.528360885"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1255"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00041616955031436757"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.522306598"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=I64 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 24,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I64 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 25,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I64 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 26,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I64 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 27,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I64 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 28,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
},
|
|
{
|
|
"name": "Device=1 In=I64 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 29,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "I64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "8388608"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1909"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002665689759036145"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0182363750388233"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00026191822334444936"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.003986137271503454"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "32027584384.489807"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "512441350151.8369"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6999035049058088"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.545965233"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2012"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002600545409185512"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.52324384"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=F64 Out=I8",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 30,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I8"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=F64 Out=I16",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 31,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I16"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=F64 Out=I32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 32,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=F64 Out=F32",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 33,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F32"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
|
|
},
|
|
{
|
|
"name": "Device=1 In=F64 Out=I64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 34,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "I64"
|
|
}
|
|
],
|
|
"summaries": [
|
|
{
|
|
"tag": "nv/element_count/Items",
|
|
"name": "Items",
|
|
"description": "Number of elements: Items",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "8388608"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/reads/InSize",
|
|
"name": "InSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/gmem/writes/OutSize",
|
|
"name": "OutSize",
|
|
"hint": "bytes",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "67108864"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of isolated kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "1909"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/mean",
|
|
"name": "CPU Time",
|
|
"description": "Mean isolated kernel execution time (measured on host CPU)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.00026655877475117843"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/cpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated CPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.01817264133840171"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/mean",
|
|
"name": "GPU Time",
|
|
"description": "Mean isolated kernel execution time (measured with CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002619331087848795"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/time/gpu/stdev/relative",
|
|
"name": "Noise",
|
|
"description": "Relative standard deviation of isolated GPU times",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.004144721519339008"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/item_rate",
|
|
"name": "Elem/s",
|
|
"description": "Number of input elements processed per second",
|
|
"hint": "item_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "32025764283.542324"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/bytes_per_second",
|
|
"name": "GlobalMem BW",
|
|
"description": "Number of bytes read/written per second to the CUDA device's global memory",
|
|
"hint": "byte_rate",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "512412228536.6772"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/bw/global/utilization",
|
|
"name": "BWUtil",
|
|
"description": "Global device memory utilization as a percentage of the device's peak bandwidth",
|
|
"hint": "percentage",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.6998637299725158"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/cold/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for isolated measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.546077204"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/sample_size",
|
|
"name": "Samples",
|
|
"description": "Number of batch kernel executions",
|
|
"hint": "sample_size",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "int64",
|
|
"value": "2003"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/time/gpu/mean",
|
|
"name": "Batch GPU",
|
|
"description": "Mean batch kernel execution time (measured by CUDA events)",
|
|
"hint": "duration",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.0002600450215789597"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tag": "nv/batch/walltime",
|
|
"name": "Walltime",
|
|
"description": "Walltime used for batch measurements",
|
|
"hint": "duration",
|
|
"hide": "Hidden by default.",
|
|
"data": [
|
|
{
|
|
"name": "value",
|
|
"type": "float64",
|
|
"value": "0.520883479"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"is_skipped": false
|
|
},
|
|
{
|
|
"name": "Device=1 In=F64 Out=F64",
|
|
"min_samples": 10,
|
|
"min_time": 0.5,
|
|
"max_noise": 0.005,
|
|
"skip_time": -1.0,
|
|
"timeout": 15.0,
|
|
"device": 1,
|
|
"type_config_index": 35,
|
|
"axis_values": [
|
|
{
|
|
"name": "In",
|
|
"type": "string",
|
|
"value": "F64"
|
|
},
|
|
{
|
|
"name": "Out",
|
|
"type": "string",
|
|
"value": "F64"
|
|
}
|
|
],
|
|
"summaries": null,
|
|
"is_skipped": true,
|
|
"skip_reason": "Not a conversion: InputType == OutputType."
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|