diff --git a/nvbench/CMakeLists.txt b/nvbench/CMakeLists.txt
index 2a40eae..76b67a1 100644
--- a/nvbench/CMakeLists.txt
+++ b/nvbench/CMakeLists.txt
@@ -2,12 +2,14 @@ set(srcs
   axes_metadata.cu
   axis_base.cu
   benchmark_base.cu
+  cuda_call.cu
   float64_axis.cu
   int64_axis.cu
   params.cu
+  state.cu
   string_axis.cu
   type_axis.cu
-  state.cu
+
   detail/state_generator.cu
 )
 
diff --git a/nvbench/cuda_call.cu b/nvbench/cuda_call.cu
new file mode 100644
index 0000000..19bce24
--- /dev/null
+++ b/nvbench/cuda_call.cu
@@ -0,0 +1,30 @@
+#include <nvbench/cuda_call.cuh>
+
+#include <stdexcept>
+#include <string>
+
+#include <fmt/format.h>
+
+namespace nvbench
+{
+
+namespace cuda_call
+{
+
+void throw_error(const std::string &filename,
+                 std::size_t lineno,
+                 const std::string &command,
+                 cudaError_t error_code)
+{
+  throw std::runtime_error(fmt::format("{}:{}: Cuda API call returned error: "
+                                       "{}: {}\nCommand: '{}'",
+                                       filename,
+                                       lineno,
+                                       cudaGetErrorName(error_code),
+                                       cudaGetErrorString(error_code),
+                                       command));
+}
+
+} // namespace cuda_call
+
+} // namespace nvbench
diff --git a/nvbench/cuda_call.cuh b/nvbench/cuda_call.cuh
new file mode 100644
index 0000000..42792f0
--- /dev/null
+++ b/nvbench/cuda_call.cuh
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <cuda_runtime_api.h>
+
+#include <string>
+
+#define NVBENCH_CUDA_CALL(call)                                                \
+  do                                                                           \
+  {                                                                            \
+    const cudaError_t nvbench_cuda_call_error = call;                          \
+    if (nvbench_cuda_call_error != cudaSuccess)                                \
+    {                                                                          \
+      nvbench::cuda_call::throw_error(__FILE__,                                \
+                                      __LINE__,                                \
+                                      #call,                                   \
+                                      nvbench_cuda_call_error);                \
+    }                                                                          \
+  } while (false)
+
+namespace nvbench
+{
+namespace cuda_call
+{
+
+void throw_error(const std::string &filename,
+                 std::size_t lineno,
+                 const std::string &call,
+                 cudaError_t error);
+
+} // namespace cuda_call
+} // namespace nvbench
diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh
new file mode 100644
index 0000000..e66b27d
--- /dev/null
+++ b/nvbench/cuda_stream.cuh
@@ -0,0 +1,26 @@
+#include <nvbench/cuda_call.cuh>
+
+#include <cuda_runtime_api.h>
+
+namespace nvbench
+{
+
+// RAII wrapper for a cudaStream_t.
+struct cuda_stream
+{
+  cuda_stream() { NVBENCH_CUDA_CALL(cudaStreamCreate(&m_stream)); }
+  ~cuda_stream() { NVBENCH_CUDA_CALL(cudaStreamDestroy(m_stream)); }
+
+  // move-only
+  cuda_stream(const cuda_stream &) = delete;
+  cuda_stream(cuda_stream &&)      = default;
+  cuda_stream &operator=(const cuda_stream &) = delete;
+  cuda_stream &operator=(cuda_stream &&) = default;
+
+  operator cudaStream_t() { return m_stream; }
+
+private:
+  cudaStream_t m_stream;
+};
+
+} // namespace nvbench
diff --git a/nvbench/cuda_timer.cuh b/nvbench/cuda_timer.cuh
new file mode 100644
index 0000000..52b92ca
--- /dev/null
+++ b/nvbench/cuda_timer.cuh
@@ -0,0 +1,65 @@
+#pragma once
+
+#include <nvbench/cuda_call.cuh>
+
+#include <cuda_runtime_api.h>
+
+namespace nvbench
+{
+
+struct cuda_timer
+{
+  cuda_timer()
+  {
+    NVBENCH_CUDA_CALL(cudaEventCreate(&m_start));
+    NVBENCH_CUDA_CALL(cudaEventCreate(&m_stop));
+  }
+
+  ~cuda_timer()
+  {
+    NVBENCH_CUDA_CALL(cudaEventDestroy(m_start));
+    NVBENCH_CUDA_CALL(cudaEventDestroy(m_stop));
+  }
+
+  // move-only
+  cuda_timer(const cuda_timer &) = delete;
+  cuda_timer(cuda_timer &&)      = default;
+  cuda_timer &operator=(const cuda_timer &) = delete;
+  cuda_timer &operator=(cuda_timer &&) = default;
+
+  void start(cudaStream_t stream)
+  {
+    NVBENCH_CUDA_CALL(cudaEventRecord(m_start, stream));
+  }
+
+  void stop(cudaStream_t stream)
+  {
+    NVBENCH_CUDA_CALL(cudaEventRecord(m_stop, stream));
+  }
+
+  bool ready() const
+  {
+    const cudaError_t state = cudaEventQuery(m_stop);
+    if (state == cudaErrorNotReady)
+    {
+      return false;
+    }
+    NVBENCH_CUDA_CALL(state);
+    return true;
+  }
+
+  double get_duration() const
+  {
+    NVBENCH_CUDA_CALL(cudaEventSynchronize(m_stop));
+    float elapsed_time;
+    // According to docs, this is in ms with a resolution of ~0.5 microseconds.
+    NVBENCH_CUDA_CALL(cudaEventElapsedTime(&elapsed_time, m_start, m_stop));
+    return elapsed_time / 1000.0;
+  }
+
+private:
+  cudaEvent_t m_start;
+  cudaEvent_t m_stop;
+};
+
+} // namespace nvbench
diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
index fb01968..7e9218d 100644
--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -1,8 +1,9 @@
 set(test_srcs
   axes_metadata.cu
   benchmark.cu
-  int64_axis.cu
+  cuda_timer.cu
   float64_axis.cu
+  int64_axis.cu
   params.cu
   state.cu
   state_generator.cu
diff --git a/testing/cuda_timer.cu b/testing/cuda_timer.cu
new file mode 100644
index 0000000..751bcd8
--- /dev/null
+++ b/testing/cuda_timer.cu
@@ -0,0 +1,58 @@
+#include <nvbench/cuda_timer.cuh>
+
+#include <nvbench/cuda_stream.cuh>
+#include <nvbench/types.cuh>
+
+#include "test_asserts.cuh"
+
+#include <fmt/format.h>
+
+#include <cuda/std/chrono>
+
+#include <cuda_runtime.h>
+
+__global__ void sleep_kernel(double seconds)
+{
+  const auto start = cuda::std::chrono::high_resolution_clock::now();
+  const auto ns    = cuda::std::chrono::nanoseconds(
+    static_cast<nvbench::int64_t>(seconds * 1000 * 1000 * 1000));
+  const auto finish = start + ns;
+
+  auto now = cuda::std::chrono::high_resolution_clock::now();
+  while (now < finish)
+  {
+    now = cuda::std::chrono::high_resolution_clock::now();
+  }
+}
+
+void test_basic(cudaStream_t time_stream,
+                cudaStream_t exec_stream,
+                bool expected)
+{
+  nvbench::cuda_timer timer;
+
+  NVBENCH_CUDA_CALL(cudaDeviceSynchronize());
+
+  timer.start(time_stream);
+  sleep_kernel<<<1, 1, 0, exec_stream>>>(0.25);
+  timer.stop(time_stream);
+
+  NVBENCH_CUDA_CALL(cudaDeviceSynchronize());
+  const bool captured = timer.get_duration() > 0.25;
+  ASSERT_MSG(captured == expected,
+             fmt::format("Unexpected result from timer: {} seconds "
+                         " (expected {})",
+                         timer.get_duration(),
+                         (expected ? "> 0.25s" : "< 0.25s")));
+}
+
+void test_basic()
+{
+  nvbench::cuda_stream stream1;
+  nvbench::cuda_stream stream2;
+
+  test_basic(stream1, stream1, true);
+  test_basic(stream1, stream2, false);
+}
+
+int main() { test_basic(); }