Build multi architecture cuda wheels (#302)

* Add cuda architectures to build wheel for

* Package scripts in wheel

* Separate cuda major version extraction to fix architecture selection logic

* Add back statement printing cuda version

* [pre-commit.ci] auto code formatting

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
Nader Al Awar
2026-01-28 19:13:24 -06:00
committed by GitHub
parent a681e2185d
commit 5e7adc5c3f
13 changed files with 55 additions and 7 deletions

View File

@@ -43,6 +43,20 @@ test-cu13 = ["pynvbench[cu13]", "pytest", "cupy-cuda13x", "numba"]
# Generic test dependencies (defaults to CUDA 12)
test = ["pytest", "cupy-cuda12x", "numba"]
tools = [
"colorama",
"jsondiff",
"matplotlib",
"numpy",
"pandas",
"seaborn",
"tabulate",
]
[project.scripts]
nvbench-compare = "scripts.nvbench_compare:main"
nvbench-histogram = "scripts.nvbench_histogram:main"
nvbench-walltime = "scripts.nvbench_walltime:main"
[project.urls]
Homepage = "https://github.com/NVIDIA/nvbench"
@@ -75,3 +89,4 @@ fallback_version = "0.0.0"
[tool.scikit-build.wheel.packages]
"cuda" = "cuda"
"cuda/bench" = "cuda/bench"
"scripts" = "scripts"

View File

@@ -0,0 +1 @@
# Package placeholder for nvbench CLI tools.

View File

@@ -0,0 +1,429 @@
#!/usr/bin/env python
import argparse
import math
import os
import sys
import jsondiff
import tabulate
from colorama import Fore
try:
from nvbench_json import reader
except ImportError:
from scripts.nvbench_json import reader
# Convert a dotted version string like "x.y.z" into a tuple of ints (x, y, z).
def version_tuple(v):
    """Return the integer tuple for a dotted version string."""
    return tuple(int(part) for part in v.split("."))
# Used to feature-gate tabulate options (colalign/github need >= 0.8.3).
tabulate_version = version_tuple(tabulate.__version__)
# Device sections of the two input files; populated by main() before comparing.
all_ref_devices = []
all_cmp_devices = []
# Result counters updated by compare_benches() and reported by main().
config_count = 0
unknown_count = 0
failure_count = 0
pass_count = 0
def find_matching_bench(needle, haystack):
    """Return the benchmark in *haystack* whose name and axes equal *needle*'s, else None."""
    matches = (
        bench
        for bench in haystack
        if bench["name"] == needle["name"] and bench["axes"] == needle["axes"]
    )
    return next(matches, None)
def find_device_by_id(device_id, all_devices):
    """Look up a device record by numeric id; None when no record matches."""
    return next((dev for dev in all_devices if dev["id"] == device_id), None)
def format_int64_axis_value(axis_name, axis_value, axes):
    """Render an int64 axis value; pow2-flagged axes print as '2^k'."""
    axis = next(ax for ax in axes if ax["name"] == axis_name)
    value = int(axis_value["value"])
    if axis["flags"] == "pow2":
        return "2^%d" % math.log2(value)
    return "%d" % value
def format_float64_axis_value(axis_name, axis_value, axes):
    """Render a float64 axis value with 5 significant digits."""
    return format(float(axis_value["value"]), ".5g")
def format_type_axis_value(axis_name, axis_value, axes):
    """Render a type axis value as its plain string form."""
    return str(axis_value["value"])
def format_string_axis_value(axis_name, axis_value, axes):
    """Render a string axis value unchanged (as a string)."""
    return str(axis_value["value"])
def format_axis_value(axis_name, axis_value, axes):
    """Dispatch to the per-type formatter for the named axis.

    Returns None for unrecognized axis types, matching the original
    fall-through behavior.
    """
    axis = next(ax for ax in axes if ax["name"] == axis_name)
    formatters = {
        "int64": format_int64_axis_value,
        "float64": format_float64_axis_value,
        "type": format_type_axis_value,
        "string": format_string_axis_value,
    }
    formatter = formatters.get(axis["type"])
    if formatter is not None:
        return formatter(axis_name, axis_value, axes)
def format_duration(seconds):
    """Format a duration in seconds with an auto-selected unit (s / ms / us).

    Values below one microsecond are still reported in microseconds.
    """
    if seconds >= 1:
        scale, unit = 1.0, "s"
    elif seconds >= 1e-3:
        scale, unit = 1e3, "ms"
    else:
        scale, unit = 1e6, "us"
    return "%0.3f %s" % (seconds * scale, unit)
def format_percentage(percentage):
    """Render a fraction as a percentage string.

    When there aren't enough samples for a meaningful noise measurement the
    noise is recorded as infinity, which JSON cannot represent — it arrives
    here as None and is printed as "inf".
    """
    return "inf" if percentage is None else "%0.2f%%" % (percentage * 100.0)
def compare_benches(ref_benches, cmp_benches, threshold, plot):
"""Compare two benchmark lists, printing a per-device results table per benchmark.

Matching states are compared on GPU mean time ("nv/cold/time/gpu/mean");
the module-level config/pass/failure/unknown counters are updated for every
matched configuration. Rows are displayed only when |%diff| >= threshold.
When *plot* names an axis, results are additionally charted along that axis.
"""
# Plotting dependencies are imported lazily so the CLI works without them.
if plot:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
for cmp_bench in cmp_benches:
# Only benchmarks present (same name + axes) in both inputs are compared.
ref_bench = find_matching_bench(cmp_bench, ref_benches)
if not ref_bench:
continue
print("# %s\n" % (cmp_bench["name"]))
cmp_device_ids = cmp_bench["devices"]
axes = cmp_bench["axes"]
ref_states = ref_bench["states"]
cmp_states = cmp_bench["states"]
axes = axes if axes else []
# Table header: one column per axis, then timing/noise/diff/status columns.
headers = [x["name"] for x in axes]
colalign = ["center"] * len(headers)
headers.append("Ref Time")
colalign.append("right")
headers.append("Ref Noise")
colalign.append("right")
headers.append("Cmp Time")
colalign.append("right")
headers.append("Cmp Noise")
colalign.append("right")
headers.append("Diff")
colalign.append("right")
headers.append("%Diff")
colalign.append("right")
headers.append("Status")
colalign.append("center")
# One table (and optional plot) per device of the compare run.
for cmp_device_id in cmp_device_ids:
rows = []
plot_data = {"cmp": {}, "ref": {}, "cmp_noise": {}, "ref_noise": {}}
for cmp_state in cmp_states:
cmp_state_name = cmp_state["name"]
# Match states by name; unmatched states are skipped silently.
ref_state = next(
filter(lambda st: st["name"] == cmp_state_name, ref_states), None
)
if not ref_state:
continue
axis_values = cmp_state["axis_values"]
if not axis_values:
axis_values = []
row = []
for axis_value in axis_values:
axis_value_name = axis_value["name"]
row.append(format_axis_value(axis_value_name, axis_value, axes))
cmp_summaries = cmp_state["summaries"]
ref_summaries = ref_state["summaries"]
if not ref_summaries or not cmp_summaries:
continue
# Find a summary entry by its tag, or None when missing.
def lookup_summary(summaries, tag):
return next(filter(lambda s: s["tag"] == tag, summaries), None)
cmp_time_summary = lookup_summary(
cmp_summaries, "nv/cold/time/gpu/mean"
)
ref_time_summary = lookup_summary(
ref_summaries, "nv/cold/time/gpu/mean"
)
cmp_noise_summary = lookup_summary(
cmp_summaries, "nv/cold/time/gpu/stdev/relative"
)
ref_noise_summary = lookup_summary(
ref_summaries, "nv/cold/time/gpu/stdev/relative"
)
# TODO: Use other timings, too. Maybe multiple rows, with a
# "Timing" column + values "CPU/GPU/Batch"?
if not all(
[
cmp_time_summary,
ref_time_summary,
cmp_noise_summary,
ref_noise_summary,
]
):
continue
# Pull the float64 "value" field out of a summary's data list.
def extract_value(summary):
summary_data = summary["data"]
value_data = next(
filter(lambda v: v["name"] == "value", summary_data)
)
assert value_data["type"] == "float64"
return value_data["value"]
cmp_time = extract_value(cmp_time_summary)
ref_time = extract_value(ref_time_summary)
cmp_noise = extract_value(cmp_noise_summary)
ref_noise = extract_value(ref_noise_summary)
# Convert string encoding to expected numerics:
cmp_time = float(cmp_time)
ref_time = float(ref_time)
diff = cmp_time - ref_time
frac_diff = diff / ref_time
# min_noise is the comparison tolerance: the smaller of the two noise
# figures, or whichever one exists; None means noise was infinite.
if ref_noise and cmp_noise:
ref_noise = float(ref_noise)
cmp_noise = float(cmp_noise)
min_noise = min(ref_noise, cmp_noise)
elif ref_noise:
ref_noise = float(ref_noise)
min_noise = ref_noise
elif cmp_noise:
cmp_noise = float(cmp_noise)
min_noise = cmp_noise
else:
min_noise = None # Noise is inf
# Bucket results by the non-plotted axis values; the plotted axis
# becomes the x coordinate.
if plot:
axis_name = []
axis_value = "--"
for aid in range(len(axis_values)):
if axis_values[aid]["name"] != plot:
axis_name.append(
"{} = {}".format(
axis_values[aid]["name"], axis_values[aid]["value"]
)
)
else:
axis_value = float(axis_values[aid]["value"])
axis_name = ", ".join(axis_name)
if axis_name not in plot_data["cmp"]:
plot_data["cmp"][axis_name] = {}
plot_data["ref"][axis_name] = {}
plot_data["cmp_noise"][axis_name] = {}
plot_data["ref_noise"][axis_name] = {}
plot_data["cmp"][axis_name][axis_value] = cmp_time
plot_data["ref"][axis_name][axis_value] = ref_time
plot_data["cmp_noise"][axis_name][axis_value] = cmp_noise
plot_data["ref_noise"][axis_name][axis_value] = ref_noise
# NOTE(review): counters are updated for every matched config,
# regardless of the display threshold below — presumably intentional.
global config_count
global unknown_count
global pass_count
global failure_count
config_count += 1
if not min_noise:
unknown_count += 1
status = Fore.YELLOW + "????" + Fore.RESET
elif abs(frac_diff) <= min_noise:
pass_count += 1
status = Fore.BLUE + "SAME" + Fore.RESET
elif diff < 0:
failure_count += 1
status = Fore.GREEN + "FAST" + Fore.RESET
else:
failure_count += 1
status = Fore.RED + "SLOW" + Fore.RESET
# Only configurations at or above the threshold are shown in the table.
if abs(frac_diff) >= threshold:
row.append(format_duration(ref_time))
row.append(format_percentage(ref_noise))
row.append(format_duration(cmp_time))
row.append(format_percentage(cmp_noise))
row.append(format_duration(diff))
row.append(format_percentage(frac_diff))
row.append(status)
rows.append(row)
if len(rows) == 0:
continue
cmp_device = find_device_by_id(cmp_device_id, all_cmp_devices)
# NOTE(review): ref_state here is whatever the state loop left bound —
# the last matched state's device. Confirm this is the intended source.
ref_device = find_device_by_id(ref_state["device"], all_ref_devices)
if cmp_device == ref_device:
print("## [%d] %s\n" % (cmp_device["id"], cmp_device["name"]))
else:
print(
"## [%d] %s vs. [%d] %s\n"
% (
ref_device["id"],
ref_device["name"],
cmp_device["id"],
cmp_device["name"],
)
)
# colalign and github format require tabulate 0.8.3
if tabulate_version >= (0, 8, 3):
print(
tabulate.tabulate(
rows, headers=headers, colalign=colalign, tablefmt="github"
)
)
else:
print(tabulate.tabulate(rows, headers=headers, tablefmt="markdown"))
print("")
# Draw one line per non-plotted axis combination, with a shaded noise band.
if plot:
plt.xscale("log")
plt.yscale("log")
plt.xlabel(plot)
plt.ylabel("time [s]")
plt.title(cmp_device["name"])
def plot_line(key, shape, label):
x = [float(x) for x in plot_data[key][axis].keys()]
y = list(plot_data[key][axis].values())
noise = list(plot_data[key + "_noise"][axis].values())
top = [y[i] + y[i] * noise[i] for i in range(len(x))]
bottom = [y[i] - y[i] * noise[i] for i in range(len(x))]
p = plt.plot(x, y, shape, marker="o", label=label)
plt.fill_between(x, bottom, top, color=p[0].get_color(), alpha=0.1)
for axis in plot_data["cmp"].keys():
plot_line("cmp", "-", axis)
plot_line("ref", "--", axis + " ref")
plt.legend()
plt.show()
def main():
    """CLI entry point: diff two NVBench JSON results (two files or two dirs).

    Returns the number of regressed/changed configurations so it can be used
    as a process exit code.
    """
    global all_ref_devices
    global all_cmp_devices

    help_text = "%(prog)s [reference.json compare.json | reference_dir/ compare_dir/]"
    parser = argparse.ArgumentParser(prog="nvbench_compare", usage=help_text)
    parser.add_argument(
        "--ignore-devices",
        dest="ignore_devices",
        default=False,
        help="Ignore differences in the device sections and compare anyway",
        action="store_true",
    )
    parser.add_argument(
        "--threshold-diff",
        type=float,
        dest="threshold",
        default=0.0,
        help="only show benchmarks where percentage diff is >= THRESHOLD",
    )
    parser.add_argument(
        "--plot-along", type=str, dest="plot", default=None, help="plot results"
    )
    args, files_or_dirs = parser.parse_known_args()
    print(files_or_dirs)
    if len(files_or_dirs) != 2:
        parser.print_help()
        sys.exit(1)

    # Two directories: pair identically-named, non-empty .json files from each.
    # Anything else is treated as a single (reference, compare) file pair.
    to_compare = []
    if os.path.isdir(files_or_dirs[0]) and os.path.isdir(files_or_dirs[1]):
        ref_dir, cmp_dir = files_or_dirs
        for entry in os.listdir(cmp_dir):
            if os.path.splitext(entry)[1] != ".json":
                continue
            ref_path = os.path.join(ref_dir, entry)
            cmp_path = os.path.join(cmp_dir, entry)
            both_files = os.path.isfile(ref_path) and os.path.isfile(cmp_path)
            if (
                both_files
                and os.path.getsize(ref_path) > 0
                and os.path.getsize(cmp_path) > 0
            ):
                to_compare.append((ref_path, cmp_path))
    else:
        to_compare = [(files_or_dirs[0], files_or_dirs[1])]

    for ref_file, cmp_file in to_compare:
        ref_root = reader.read_file(ref_file)
        cmp_root = reader.read_file(cmp_file)
        all_ref_devices = ref_root["devices"]
        all_cmp_devices = cmp_root["devices"]
        # Mismatched device sections are fatal unless explicitly ignored.
        if ref_root["devices"] != cmp_root["devices"]:
            color = Fore.YELLOW if args.ignore_devices else Fore.RED
            print(color + "Device sections do not match:" + Fore.RESET)
            print(
                jsondiff.diff(
                    ref_root["devices"], cmp_root["devices"], syntax="symmetric"
                )
            )
            if not args.ignore_devices:
                sys.exit(1)
        compare_benches(
            ref_root["benchmarks"], cmp_root["benchmarks"], args.threshold, args.plot
        )

    print("# Summary\n")
    print("- Total Matches: %d" % config_count)
    print(" - Pass (diff <= min_noise): %d" % pass_count)
    print(" - Unknown (infinite noise): %d" % unknown_count)
    print(" - Failure (diff > min_noise): %d" % failure_count)
    return failure_count


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,127 @@
#!/usr/bin/env python
import argparse
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
try:
from nvbench_json import reader
except ImportError:
from scripts.nvbench_json import reader
def parse_files():
    """Collect the .json result files named on the command line.

    Directory arguments are expanded to their non-empty .json entries;
    the final list is sorted. Prints usage and exits when nothing is found.
    """
    help_text = "%(prog)s [nvbench.out.json | dir/] ..."
    parser = argparse.ArgumentParser(prog="nvbench_histogram", usage=help_text)
    args, files_or_dirs = parser.parse_known_args()

    filenames = []
    for entry in files_or_dirs:
        if not os.path.isdir(entry):
            filenames.append(entry)
            continue
        for name in os.listdir(entry):
            if os.path.splitext(name)[1] != ".json":
                continue
            path = os.path.join(entry, name)
            if os.path.isfile(path) and os.path.getsize(path) > 0:
                filenames.append(path)
    filenames.sort()

    if not filenames:
        parser.print_help()
        exit(0)
    return filenames
def extract_filename(summary):
    """Pull the sample-data filename out of a summary's data entries."""
    entry = next(item for item in summary["data"] if item["name"] == "filename")
    assert entry["type"] == "string"
    return entry["value"]
def extract_size(summary):
    """Pull the sample count (int64 'size' field) out of a summary's data entries."""
    entry = next(item for item in summary["data"] if item["name"] == "size")
    assert entry["type"] == "int64"
    return int(entry["value"])
def parse_samples_meta(filename, state):
    """Return (sample_count, samples_path) for a state, or (None, None) when absent."""
    summaries = state["summaries"]
    if not summaries:
        return None, None

    tag = "nv/json/bin:nv/cold/sample_times"
    summary = next((s for s in summaries if s["tag"] == tag), None)
    if summary is None:
        return None, None

    sample_filename = extract_filename(summary)
    # Relative sample paths are resolved next to the .json file itself.
    if not os.path.isabs(sample_filename):
        sample_filename = os.path.join(os.path.dirname(filename), sample_filename)
    return extract_size(summary), sample_filename
def parse_samples(filename, state):
    """Load a state's binary sample times (little-endian float32) as an array.

    Returns an empty list when the state has no sample-times attachment.
    """
    count, path = parse_samples_meta(filename, state)
    if not count or not path:
        return []
    with open(path, "rb") as f:
        samples = np.fromfile(f, "<f4")
    assert count == len(samples)
    return samples
def to_df(data):
    """Build a DataFrame from ragged per-benchmark sample lists (short columns NaN-padded)."""
    return pd.DataFrame.from_dict({key: pd.Series(vals) for key, vals in data.items()})
def parse_json(filename):
    """Read one NVBench .json file and return a DataFrame of its sample times.

    Columns are keyed "<benchmark name> <state name>"; states without
    sample data are skipped.
    """
    json_root = reader.read_file(filename)
    samples_data = {}
    for bench in json_root["benchmarks"]:
        print("Benchmark: {}".format(bench["name"]))
        for state in bench["states"]:
            print("State: {}".format(state["name"]))
            samples = parse_samples(filename, state)
            if len(samples):
                key = "{} {}".format(bench["name"], state["name"])
                samples_data[key] = samples
    return to_df(samples_data)
def main():
    """Plot a KDE of sample-time distributions across all given result files."""
    frames = [parse_json(name) for name in parse_files()]
    combined = pd.concat(frames, ignore_index=True)
    sns.displot(combined, rug=True, kind="kde", fill=True)
    plt.show()


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,3 @@
"""nvbench_json: helpers for loading and version-checking NVBench JSON output."""
from . import reader, version
__all__ = ["reader", "version"]

View File

@@ -0,0 +1,10 @@
import json
from . import version
def read_file(filename):
    """Load an NVBench JSON result file and warn if its schema version differs.

    Returns the parsed JSON root object. A version mismatch only prints a
    warning (see version.check_file_version); it does not raise.
    """
    # JSON is defined as UTF-8 (RFC 8259): open with an explicit encoding
    # rather than the platform default (e.g. cp1252 on Windows).
    with open(filename, "r", encoding="utf-8") as f:
        file_root = json.load(f)
    version.check_file_version(filename, file_root)
    return file_root

View File

@@ -0,0 +1,32 @@
# Version of the NVBench JSON schema this reader understands.
file_version = (1, 0, 0)
file_version_string = "{}.{}.{}".format(*file_version)


def check_file_version(filename, root_node):
    """Print a warning when *root_node* was not written with our schema version.

    Files missing version metadata (pre-versioning format) also warn.
    Always returns None; warnings go to stdout.
    """
    try:
        version_node = root_node["meta"]["version"]["json"]
    except KeyError:
        # Old files carry no version metadata at all.
        print("WARNING:")
        print(" {} is written in an older, unversioned format. ".format(filename))
        print(" It may not read correctly.")
        print(" Reader expects JSON file version {}.".format(file_version_string))
        return

    # TODO We could do something fancy here using semantic versioning, but
    # for now just warn on mismatch.
    if version_node["string"] == file_version_string:
        return
    print("WARNING:")
    print(
        " {} was written using a different NVBench JSON file version.".format(
            filename
        )
    )
    print(" It may not read correctly.")
    print(
        " (file version: {} reader version: {})".format(
            version_node["string"], file_version_string
        )
    )

View File

@@ -0,0 +1,361 @@
#!/usr/bin/env python
import argparse
import math
import os
import sys
import tabulate
try:
from nvbench_json import reader
except ImportError:
from scripts.nvbench_json import reader
# Convert a dotted version string like "x.y.z" into a tuple of ints (x, y, z).
def version_tuple(v):
    """Return the integer tuple for a dotted version string."""
    return tuple(int(part) for part in v.split("."))
# Used to feature-gate tabulate options (colalign/github need >= 0.8.3).
tabulate_version = version_tuple(tabulate.__version__)
# NOTE(review): appears unused in the visible portion of this script — confirm.
all_devices = []
def format_axis_value(axis_value, axis_type):
    """Normalize an axis value to its display string for the given type.

    Unknown types pass through unchanged.
    """
    converters = {
        "int64": lambda v: "%d" % int(v),
        "float64": lambda v: "%.5g" % float(v),
    }
    convert = converters.get(axis_type)
    return convert(axis_value) if convert is not None else axis_value
def format_walltime(seconds_in):
    """Format seconds as [HH:][MM:]SS.mmm, dropping leading zero-valued fields."""
    hours = math.floor(seconds_in / (60 * 60))
    minutes = math.floor((seconds_in / 60) % 60)
    seconds = math.floor(seconds_in % 60)
    millis = math.floor((seconds_in * 1000) % 1000)

    show_hours = hours > 1e-9
    show_minutes = show_hours or minutes > 1e-9

    parts = []
    if show_hours:
        parts.append("{:0>2d}:".format(hours))
    if show_minutes:
        parts.append("{:0>2d}:".format(minutes))
        parts.append("{:0>2d}.".format(seconds))
    else:
        # Seconds are unpadded when they are the leading field.
        parts.append("{:d}.".format(seconds))
    parts.append("{:0>3d}".format(millis))
    return "".join(parts)
def format_percentage(percentage):
    """Render a fraction as a percentage string.

    Insufficient samples make the noise infinite, which JSON cannot encode —
    it arrives here as None and is printed as "inf".
    """
    if percentage is None:
        return "inf"
    return "%0.2f%%" % (percentage * 100.0)
# Measurement kinds tracked per state, and their table column titles.
measure_names = ["cold", "batch", "cupti"]
measure_column_names = {"cold": "Isolated", "batch": "Batch", "cupti": "CUPTI"}


def init_measures():
    """Return a fresh walltime accumulator with every measure zeroed."""
    return {name: 0.0 for name in measure_names}
def get_measures(state):
    """Extract the per-measure walltimes (seconds) recorded in a state's summaries.

    Only measures whose "nv/<name>/walltime" summary exists appear in the
    returned dict; falsy values (0.0) are normalized to exactly 0.0.
    """
    summaries = state["summaries"]
    times = {}
    for name in measure_names:
        tag = "nv/{}/walltime".format(name)
        summary = next((s for s in summaries if s["tag"] == tag), None)
        if summary is None:
            continue
        walltime_data = next(d for d in summary["data"] if d["name"] == "value")
        assert walltime_data["type"] == "float64"
        walltime = float(walltime_data["value"])
        times[name] = walltime if walltime else 0.0
    return times
def merge_measures(target, src):
    """Accumulate every walltime in *src* into *target* in place."""
    for name in src:
        target[name] += src[name]
def sum_measures(measures):
    """Return the total walltime across all measures."""
    return sum(measures.values(), 0.0)
def get_active_measure_names(measures):
    """Return the names of measures with a non-negligible (> 1 ns) walltime."""
    return [name for name, time in measures.items() if time > 1e-9]
def append_measure_headers(headers, active=measure_names):
    """Extend *headers* with the display column title of each active measure."""
    headers.extend(measure_column_names[name] for name in active)
def append_measure_values(row, measures, active=measure_names):
    """Extend *row* with the formatted walltime of each active measure."""
    row.extend(format_walltime(measures[name]) for name in active)
def consume_file(filename):
    """Read one result file and return its per-benchmark data plus file totals."""
    file_root = reader.read_file(filename)
    file_measures = init_measures()
    benches = {}
    for bench in file_root["benchmarks"]:
        bench_data = consume_benchmark(bench, file_root)
        merge_measures(file_measures, bench_data["measures"])
        benches[bench["name"]] = bench_data
    return {"benches": benches, "measures": file_measures}
def consume_benchmark(bench, file_root):
    """Aggregate one benchmark's walltimes: per-state, per-axis-value, and total."""
    # Seed an accumulator for every value of every axis up front.
    axes_out = {}
    for axis in bench["axes"] or []:
        axis_type = axis["type"]
        values_out = {}
        for value in axis["values"]:
            if axis_type == "type":
                key = value["input_string"]
            else:
                key = format_axis_value(value["value"], axis_type)
            values_out[key] = {"measures": init_measures()}
        axes_out[axis["name"]] = values_out

    states_out = {}
    bench_measures = init_measures()
    for state in bench["states"]:
        state_measures = get_measures(state)
        states_out[state["name"]] = {"measures": state_measures}
        # Fold this state's walltimes into the benchmark totals...
        merge_measures(bench_measures, state_measures)
        # ...and into each axis value that produced the state.
        for axis_value in state["axis_values"] or []:
            key = format_axis_value(axis_value["value"], axis_value["type"])
            merge_measures(
                axes_out[axis_value["name"]][key]["measures"], state_measures
            )

    return {"axes": axes_out, "measures": bench_measures, "states": states_out}
def print_overview_section(data):
    """Print the top-level walltime summary table."""
    print("# Walltime Overview\n")
    measures = data["measures"]
    active = get_active_measure_names(measures)

    headers = ["Walltime"]
    append_measure_headers(headers, active)
    colalign = ["right"] * len(headers)

    row = [format_walltime(sum_measures(measures))]
    append_measure_values(row, measures, active)
    rows = [row]

    # colalign and github format require tabulate 0.8.3
    if tabulate_version >= (0, 8, 3):
        print(
            tabulate.tabulate(
                rows, headers=headers, colalign=colalign, tablefmt="github"
            )
        )
    else:
        print(tabulate.tabulate(rows, headers=headers, tablefmt="markdown"))
    print()
# append_data_row_lambda args: (row_list, name, items[name])
def print_measures_table(
    headers, colalign, items, total_measures, append_item_row_lambda
):
    """Print a table of items with %, total walltime, and per-measure columns.

    *headers*/*colalign* carry the caller's leading columns;
    *append_item_row_lambda* fills those leading columns for each item.
    A "Total" row is appended at the bottom.
    """
    total_time = sum_measures(total_measures)
    active = get_active_measure_names(total_measures)

    num_user_columns = len(headers)
    headers.append("%")
    headers.append("Walltime")
    append_measure_headers(headers, active)
    while len(colalign) < len(headers):
        colalign.append("right")

    rows = []
    for name, item in items.items():
        item_measures = item["measures"]
        item_time = sum_measures(item_measures)
        row = []
        append_item_row_lambda(row, name, item)
        fraction = item_time / total_time if total_time > 1e-9 else 0
        row.append(format_percentage(fraction))
        row.append(format_walltime(item_time))
        append_measure_values(row, item_measures, active)
        rows.append(row)

    # Totals row: label only when there is a user column to put it in.
    total_row = []
    if num_user_columns != 0:
        total_row.append("Total")
    while len(total_row) < num_user_columns:
        total_row.append("")
    total_row.append(format_percentage(1))
    total_row.append(format_walltime(total_time))
    append_measure_values(total_row, total_measures, active)
    rows.append(total_row)

    # colalign and github format require tabulate 0.8.3
    if tabulate_version >= (0, 8, 3):
        print(
            tabulate.tabulate(
                rows, headers=headers, colalign=colalign, tablefmt="github"
            )
        )
    else:
        print(tabulate.tabulate(rows, headers=headers, tablefmt="markdown"))
def print_files_section(data):
    """Print the per-file breakdown table, then a detail section for each file."""
    print("# Files\n")
    items = data["files"]

    def name_column(row, name, item):
        row.append(name)

    print_measures_table(["Filename"], ["left"], items, data["measures"], name_column)
    print()
    for filename, file in items.items():
        print_file_section(filename, file)
def print_file_section(filename, file):
    """Print one file's per-benchmark table, then each benchmark's details."""
    print("## File: {}\n".format(filename))
    benches = file["benches"]

    def name_column(row, name, item):
        row.append(name)

    print_measures_table(
        ["Benchmark"], ["left"], benches, file["measures"], name_column
    )
    print()
    for bench_name, bench in benches.items():
        print_bench_section(bench_name, bench)
def print_bench_section(bench_name, bench):
    """Print a benchmark's per-configuration table and one table per axis."""
    print("### Benchmark: {}\n".format(bench_name))

    def name_column(row, name, item):
        row.append(name)

    # TODO split this up so each axis is a column
    print_measures_table(
        ["Configuration"], ["left"], bench["states"], bench["measures"], name_column
    )
    print()
    for axis_name, axis in bench["axes"].items():
        print_measures_table(
            ["Axis: " + axis_name], ["left"], axis, bench["measures"], name_column
        )
        print()
def main():
    """Entry point: aggregate walltimes across the given files/dirs and report."""
    help_text = "%(prog)s [nvbench.out.json | dir/]..."
    parser = argparse.ArgumentParser(prog="nvbench_walltime", usage=help_text)
    args, files_or_dirs = parser.parse_known_args()

    # Expand directory arguments into their non-empty .json entries.
    filenames = []
    for entry in files_or_dirs:
        if not os.path.isdir(entry):
            filenames.append(entry)
            continue
        for name in os.listdir(entry):
            if os.path.splitext(name)[1] != ".json":
                continue
            path = os.path.join(entry, name)
            if os.path.isfile(path) and os.path.getsize(path) > 0:
                filenames.append(path)
    filenames.sort()

    files_out = {}
    measures = init_measures()
    for filename in filenames:
        file_data = consume_file(filename)
        merge_measures(measures, file_data["measures"])
        files_out[filename] = file_data

    data = {"files": files_out, "measures": measures}
    print_overview_section(data)
    print_files_section(data)


if __name__ == "__main__":
    sys.exit(main())

17524
python/scripts/test_cmp.json Normal file

File diff suppressed because it is too large Load Diff

17524
python/scripts/test_ref.json Normal file

File diff suppressed because it is too large Load Diff