CUTLASS 2.0 (#62)

CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater.
2026-05-20 21:08:57 +00:00 · 2019-11-19 16:55:34 -08:00
parent b5cab177a9
commit fb335f6a5f
5434 changed files with 599799 additions and 250176 deletions
--- a/tools/profiler/src/gpu_timer.cpp
+++ b/tools/profiler/src/gpu_timer.cpp
@@ -0,0 +1,107 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Implements GpuTimer, a timer built on a pair of CUDA events
+*/
+
+#include <stdexcept>
+
+#include "gpu_timer.h"
+
+namespace cutlass {
+namespace profiler {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Creates every CUDA event held in `events`.
+///
+/// Throws std::runtime_error if any cudaEventCreate() call fails. Events that were created
+/// before the failing call are destroyed first: the destructor does not run for an object whose
+/// constructor throws, so they would otherwise be leaked.
+GpuTimer::GpuTimer() {
+
+  for (auto & event : events) {
+    cudaError_t result = cudaEventCreate(&event);
+    if (result != cudaSuccess) {
+
+      // Release the events created ahead of the failing one (iteration order matches creation
+      // order, so everything before `event` is a live handle).
+      for (auto & created : events) {
+        if (&created == &event) {
+          break;
+        }
+        cudaEventDestroy(created);
+      }
+
+      throw std::runtime_error("Failed to create CUDA event");
+    }
+  }
+}
+
+/// Destroys every CUDA event held in `events`. The status returned by cudaEventDestroy() is
+/// discarded; nothing is thrown from the destructor.
+GpuTimer::~GpuTimer() {
+  for (auto & handle : events) {
+    (void)cudaEventDestroy(handle);
+  }
+}
+
+/// Records the start event (events[0]) in `stream`.
+/// Throws std::runtime_error if cudaEventRecord() reports an error.
+void GpuTimer::start(cudaStream_t stream) {
+  if (cudaEventRecord(events[0], stream) != cudaSuccess) {
+    throw std::runtime_error("Failed to record start event.");
+  }
+}
+
+/// Records the stop event (events[1]) in `stream`.
+/// Throws std::runtime_error if cudaEventRecord() reports an error.
+void GpuTimer::stop(cudaStream_t stream) {
+  if (cudaEventRecord(events[1], stream) != cudaSuccess) {
+    throw std::runtime_error("Failed to record stop event.");
+  }
+}
+
+/// Records the stop event in `stream` via stop(), then synchronizes: a non-null stream is
+/// passed to cudaStreamSynchronize(), while a null stream falls back to cudaDeviceSynchronize().
+/// Throws std::runtime_error if recording or synchronization reports an error.
+void GpuTimer::stop_and_wait(cudaStream_t stream) {
+
+  stop(stream);
+
+  // Null stream: synchronize with the whole device and finish early.
+  if (!stream) {
+    if (cudaDeviceSynchronize() != cudaSuccess) {
+      throw std::runtime_error("Failed to synchronize with CUDA device.");
+    }
+    return;
+  }
+
+  // Non-null stream: synchronize with that stream only.
+  if (cudaStreamSynchronize(stream) != cudaSuccess) {
+    throw std::runtime_error("Failed to synchronize with non-null CUDA stream.");
+  }
+}
+
+/// Returns the elapsed time in milliseconds between the start event (events[0]) and the stop
+/// event (events[1]), divided by `iterations`.
+/// Throws std::runtime_error if cudaEventElapsedTime() reports an error.
+// NOTE(review): `iterations` is not validated here; presumably callers always pass a positive
+// count - confirm, since zero would make the division meaningless.
+double GpuTimer::duration(int iterations) const {
+
+  // Total time between the two events; the per-iteration figure is derived below.
+  float elapsed_ms = 0;
+
+  if (cudaEventElapsedTime(&elapsed_ms, events[0], events[1]) != cudaSuccess) {
+    throw std::runtime_error("Failed to query elapsed time from CUDA events.");
+  }
+
+  return static_cast<double>(elapsed_ms) / static_cast<double>(iterations);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace profiler
+} // namespace cutlass