diff --git a/examples/04_tile_iterator/tile_iterator.cu b/examples/04_tile_iterator/tile_iterator.cu index 8fc193195..886c17701 100644 --- a/examples/04_tile_iterator/tile_iterator.cu +++ b/examples/04_tile_iterator/tile_iterator.cu @@ -50,7 +50,6 @@ #include #include #include -#include // CUTLASS includes #include "cutlass/transform/threadblock/predicated_tile_iterator.h" diff --git a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu index b73912169..bd74ce12d 100644 --- a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu +++ b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu @@ -124,6 +124,7 @@ compare if the output from CUTLASS kernel is same as the reference implicit GEMM */ #include +#include #include #include "cutlass/cutlass.h" diff --git a/examples/10_planar_complex/planar_complex.cu b/examples/10_planar_complex/planar_complex.cu index c7dc7b275..9a9dc8888 100644 --- a/examples/10_planar_complex/planar_complex.cu +++ b/examples/10_planar_complex/planar_complex.cu @@ -74,7 +74,6 @@ */ #include -#include #include #include "cutlass/cutlass.h" diff --git a/examples/11_planar_complex_array/planar_complex_array.cu b/examples/11_planar_complex_array/planar_complex_array.cu index 1dd358464..272390f26 100644 --- a/examples/11_planar_complex_array/planar_complex_array.cu +++ b/examples/11_planar_complex_array/planar_complex_array.cu @@ -72,7 +72,6 @@ */ #include -#include #include #include "cutlass/cutlass.h" diff --git a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu index a35a39462..66b0dee50 100644 --- a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu +++ b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu @@ -111,6 +111,7 @@ compare if the output from CUTLASS kernel is same as the reference implicit GEMM */ #include +#include #include #include "cutlass/cutlass.h" diff --git a/examples/22_quaternion_conv/quaternion_conv.cu b/examples/22_quaternion_conv/quaternion_conv.cu index cd2a48d9a..756d46512 100644 --- a/examples/22_quaternion_conv/quaternion_conv.cu +++ b/examples/22_quaternion_conv/quaternion_conv.cu @@ -30,6 +30,7 @@ **************************************************************************************************/ #include +#include #include #include "cutlass/cutlass.h" diff --git a/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu b/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu index bb880f4fe..41ea3200a 100644 --- a/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu +++ b/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu @@ -41,6 +41,7 @@ epilogue/threadblock/epilogue_gemm_k_reduction.h */ #include +#include #include #include "cutlass/cutlass.h" diff --git a/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu b/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu index 661efcf7b..fe756fbad 100644 --- a/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu +++ b/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu @@ -52,6 +52,7 @@ line is the same. */ #include +#include #include #include "cutlass/cutlass.h" diff --git a/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu b/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu index da3ec1ca8..72d7284f6 100644 --- a/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu +++ b/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu @@ -49,6 +49,7 @@ technical details. */ #include +#include #include #include "cutlass/cutlass.h" diff --git a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu index b2996f2d3..a197e2efc 100644 --- a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu +++ b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu @@ -36,6 +36,7 @@ compared with CUDA Cores. See example 27 for the trick of 3xTF32. */ #include +#include #include #include "cutlass/cutlass.h" diff --git a/examples/30_wgrad_split_k/30_wgrad_split_k.cu b/examples/30_wgrad_split_k/30_wgrad_split_k.cu index b49446cc0..5016adf29 100644 --- a/examples/30_wgrad_split_k/30_wgrad_split_k.cu +++ b/examples/30_wgrad_split_k/30_wgrad_split_k.cu @@ -40,6 +40,7 @@ to correctly instantiate the GEMM template. */ #include +#include #include #include "cutlass/cutlass.h" diff --git a/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu b/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu index 8ceea6383..f22e235f5 100644 --- a/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu +++ b/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu @@ -69,6 +69,7 @@ #include #include +#include #include #include diff --git a/test/unit/conv/device/conv2d_testbed.h b/test/unit/conv/device/conv2d_testbed.h index 125c177ae..9f0e04f94 100644 --- a/test/unit/conv/device/conv2d_testbed.h +++ b/test/unit/conv/device/conv2d_testbed.h @@ -33,6 +33,8 @@ */ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/cutlass.h" diff --git a/test/unit/conv/device/conv2d_testbed_interleaved.h b/test/unit/conv/device/conv2d_testbed_interleaved.h index db2719990..2aa60f0be 100644 --- a/test/unit/conv/device/conv2d_testbed_interleaved.h +++ b/test/unit/conv/device/conv2d_testbed_interleaved.h @@ -33,6 +33,8 @@ */ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/cutlass.h" diff --git a/test/unit/conv/device/conv2d_with_broadcast_testbed.h b/test/unit/conv/device/conv2d_with_broadcast_testbed.h index 1561ed8ee..dd12bf605 100644 --- a/test/unit/conv/device/conv2d_with_broadcast_testbed.h +++ b/test/unit/conv/device/conv2d_with_broadcast_testbed.h @@ -37,6 +37,8 @@ */ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/cutlass.h" diff --git a/test/unit/conv/device/conv2d_with_reduction_testbed.h b/test/unit/conv/device/conv2d_with_reduction_testbed.h index d2ccc9f1f..a147275bd 100644 --- a/test/unit/conv/device/conv2d_with_reduction_testbed.h +++ b/test/unit/conv/device/conv2d_with_reduction_testbed.h @@ -33,6 +33,8 @@ */ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/cutlass.h" diff --git a/test/unit/conv/device/conv3d_testbed.h b/test/unit/conv/device/conv3d_testbed.h index 1c511c194..f9cc3563c 100644 --- a/test/unit/conv/device/conv3d_testbed.h +++ b/test/unit/conv/device/conv3d_testbed.h @@ -33,6 +33,8 @@ */ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/cutlass.h" diff --git a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu index 9373e7da1..4b3feccac 100644 --- a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu +++ b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu @@ -32,8 +32,6 @@ \brief Unit tests for thread-level GEMM */ -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/epilogue/threadblock/epilogue_simt.cu b/test/unit/epilogue/threadblock/epilogue_simt.cu index bca6c2472..386f2871f 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt.cu @@ -32,8 +32,6 @@ \brief Unit tests for thread-level GEMM */ -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu index 880d490ce..84f9110f4 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu @@ -32,8 +32,6 @@ \brief Unit tests for thread-level GEMM */ -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu index 48c8be179..4aa27befe 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu @@ -32,8 +32,6 @@ \brief Unit tests for thread-level GEMM */ -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu index 696af7b54..5d185dd5e 100644 --- a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu +++ b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu @@ -32,8 +32,6 @@ \brief Unit tests for thread-level GEMM */ -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu index 828e39410..415f7dd73 100644 --- a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu +++ b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu @@ -32,8 +32,6 @@ \brief Unit tests for thread-level GEMM */ -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu index b64901545..922bebc12 100644 --- a/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu +++ b/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu @@ -33,8 +33,6 @@ \brief Unit tests for thread-level GEMM */ -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu index 4606b8456..1e9e5c873 100644 --- a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu +++ b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu @@ -35,8 +35,6 @@ #ifdef CUTLASS_ARCH_WMMA_SM70_ENABLED -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/epilogue/threadblock/output_tile_threadmap.cu b/test/unit/epilogue/threadblock/output_tile_threadmap.cu index 7d434eaa6..c33e0a89d 100644 --- a/test/unit/epilogue/threadblock/output_tile_threadmap.cu +++ b/test/unit/epilogue/threadblock/output_tile_threadmap.cu @@ -32,8 +32,6 @@ \brief Unit tests for thread-level GEMM */ -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/gemm/device/testbed_complex.h b/test/unit/gemm/device/testbed_complex.h index 561859a40..e6893026a 100644 --- a/test/unit/gemm/device/testbed_complex.h +++ b/test/unit/gemm/device/testbed_complex.h @@ -35,7 +35,6 @@ #pragma once #include -#include #include #include diff --git a/test/unit/gemm/device/testbed_grouped.h b/test/unit/gemm/device/testbed_grouped.h index 2641e8d18..5ec416184 100644 --- a/test/unit/gemm/device/testbed_grouped.h +++ b/test/unit/gemm/device/testbed_grouped.h @@ -36,6 +36,7 @@ #pragma once #include +#include #include "../../common/cutlass_unit_test.h" #include "cutlass/cutlass.h" diff --git a/test/unit/gemm/device/testbed_sanity.h b/test/unit/gemm/device/testbed_sanity.h index d7f63c3cf..e39760245 100644 --- a/test/unit/gemm/device/testbed_sanity.h +++ b/test/unit/gemm/device/testbed_sanity.h @@ -33,7 +33,6 @@ */ #include -#include #include #include "../../common/cutlass_unit_test.h" diff --git a/test/unit/gemm/device/testbed_splitk.h b/test/unit/gemm/device/testbed_splitk.h index 8ad85a9eb..fcc136c1a 100644 --- a/test/unit/gemm/device/testbed_splitk.h +++ b/test/unit/gemm/device/testbed_splitk.h @@ -35,7 +35,6 @@ #pragma once #include -#include #include #include "../../common/cutlass_unit_test.h" diff --git a/test/unit/gemm/threadblock/mma_multistage_testbed_slicedk.h b/test/unit/gemm/threadblock/mma_multistage_testbed_slicedk.h index a9cf2c624..c8343f8fc 100644 --- a/test/unit/gemm/threadblock/mma_multistage_testbed_slicedk.h +++ b/test/unit/gemm/threadblock/mma_multistage_testbed_slicedk.h @@ -35,6 +35,8 @@ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/gemm/threadblock/mma_pipelined_testbed.h b/test/unit/gemm/threadblock/mma_pipelined_testbed.h index bdc0d873e..c36e80302 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_testbed.h +++ b/test/unit/gemm/threadblock/mma_pipelined_testbed.h @@ -34,6 +34,8 @@ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/gemm/threadblock/mma_pipelined_testbed_slicedk.h b/test/unit/gemm/threadblock/mma_pipelined_testbed_slicedk.h index d0e0e05a7..1d509d5cc 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_testbed_slicedk.h +++ b/test/unit/gemm/threadblock/mma_pipelined_testbed_slicedk.h @@ -35,6 +35,8 @@ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/gemm/threadblock/mma_planar_complex_testbed.h b/test/unit/gemm/threadblock/mma_planar_complex_testbed.h index 59539062d..1d8ef51c3 100644 --- a/test/unit/gemm/threadblock/mma_planar_complex_testbed.h +++ b/test/unit/gemm/threadblock/mma_planar_complex_testbed.h @@ -34,6 +34,8 @@ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/cutlass.h" diff --git a/tools/util/include/cutlass/util/distribution.h b/tools/util/include/cutlass/util/distribution.h index 4ebf58bba..773487eeb 100644 --- a/tools/util/include/cutlass/util/distribution.h +++ b/tools/util/include/cutlass/util/distribution.h @@ -34,7 +34,7 @@ \brief This header contains a class to parametrize a statistical distribution function. */ -#include +#include namespace cutlass {